/*
 * Copyright (c) 1997 Massachusetts Institute of Technology
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to use, copy, modify, and distribute the Software without
 * restriction, provided the Software, including any modified copies made
 * under this license, is not distributed for a fee, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE MASSACHUSETTS INSTITUTE OF TECHNOLOGY BE LIABLE
 * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * Except as contained in this notice, the name of the Massachusetts
 * Institute of Technology shall not be used in advertising or otherwise
 * to promote the sale, use or other dealings in this Software without
 * prior written authorization from the Massachusetts Institute of
 * Technology.
 *  
 */

#include <stdio.h>
#include <stdlib.h>

#include <math.h>

#include "bench_utils.h"
#include "bench_1d_protos.h"
#include "bench_3d_protos.h"
#include "bench_ffts.h"

extern int check_prime_factors(int n, int maxprime);

/* Skeleton for typical routine to perform benchmark for an fft:

void do_foo_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                int size_arr, int size_work,
                short compute_accuracy, factor_type allowed_factors)
{
     if (rank not okay) return;
     FFT_NAME("Foo");
     if (N != 0) {
          check if N is okay for foo...
          init foo fft...
     }
     if (N is okay for foo...)
          DO_BENCHMARK_ND(rank,n,N,...)
     else
          skip_benchmark("could not handle this N because...");
}

This routine should be called from bench_1d/bench_3d, inserted in
alphabetical order by "Foo".

For Fortran FFTs, the body of the do_foo_fft function should be bracketed
by #ifdef HAVE_F77 ... #endif directives.

*/

/*
 * Benchmark the four Arndt FFT variants ("Arndt DIF", "Arndt DIT",
 * "Arndt Split-Radix" and "Arndt 4-step").
 *
 * Only rank-1 transforms are handled, and every variant is registered as
 * requiring a power-of-two size.  Each routine is called in place on arr,
 * which is handed over as two FFTW_REAL pointers (arr and arr + N)
 * together with log2(N) and a sign (-1 for the timed transform, +1 for
 * the inverse, whose result is scaled by 1.0/N).
 */
void do_arndt_ffts(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
     int log2N = 0;

     if (rank != 1) return;

     /* All four variants below share log2N, so compute it once up front
        instead of tying it to FFT_OK for the first variant only: that way
        the value is valid even if "Arndt DIF" itself is not going to be
        run.  For any N > 0 the loop terminates (n1 eventually becomes
        odd); for a power of two it yields exactly log2(N). */
     if (N > 0) {
          int n1 = N;
          while (n1 % 2 == 0) {
               n1 /= 2;
               log2N += 1;
          }
     }

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Arndt DIF");

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     dif4_fft((FFTW_REAL*)arr,
                              (FFTW_REAL*)arr + N, log2N, -1),
                     1.0, -1, 0,
                     dif4_fft((FFTW_REAL*)arr,
                              (FFTW_REAL*)arr + N, log2N, +1),
                     1.0/N,
                     compute_accuracy);

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Arndt DIT");

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     dit4_fft((FFTW_REAL*)arr,
                              (FFTW_REAL*)arr + N, log2N, -1),
                     1.0, -1, 0,
                     dit4_fft((FFTW_REAL*)arr,
                              (FFTW_REAL*)arr + N, log2N, +1),
                     1.0/N,
                     compute_accuracy);

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Arndt Split-Radix");

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     duhamel_fft((FFTW_REAL*)arr,
                                 (FFTW_REAL*)arr + N, log2N, -1),
                     1.0, -1, 0,
                     duhamel_fft((FFTW_REAL*)arr,
                                 (FFTW_REAL*)arr + N, log2N, +1),
                     1.0/N,
                     compute_accuracy);

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Arndt 4-step");

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     four_step_fft((FFTW_REAL*)arr,
                                   (FFTW_REAL*)arr + N, log2N, -1),
                     1.0, -1, 0,
                     four_step_fft((FFTW_REAL*)arr,
                                   (FFTW_REAL*)arr + N, log2N, +1),
                     1.0/N,
                     compute_accuracy);
}

/*
 * Benchmark Bailey's Fortran FFT (mpinix / mpcfft).
 *
 * The body is compiled only when HAVE_F77 is defined.  The routine is
 * run only for rank-1, power-of-two sizes, and only when FFTW_REAL has
 * the same size as a double.
 */
void do_bailey_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     if (rank != 1 || sizeof(FFTW_REAL) != sizeof(double))
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bailey");
     if (!FFT_OK)
          return;

     {
          int fwd_sign = -1, inv_sign = +1;
          int log2_len = 0;
          int remaining = N;
          /* table passed to mpinix/mpcfft; it starts N complex elements
             into the work array */
          FFTW_REAL *u_tab = (FFTW_REAL *) (work + N);

          if (N != 0) {
               /* N is a power of two here, so this counts log2(N) */
               for (; remaining % 2 == 0; remaining /= 2)
                    ++log2_len;
               FORTRANIZE(mpinix,MPINIX)(&log2_len,u_tab);
          }

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(mpcfft,MPCFFT)(&fwd_sign,&log2_len,arr,work,u_tab),
                          1.0, -1, 0,
                          FORTRANIZE(mpcfft,MPCFFT)(&inv_sign,&log2_len,arr,work,u_tab),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}

/*
 * Benchmark the Beauregard FFT (rank 1, power-of-two sizes only).
 * BeauregardFFT is called in place on arr with log2(N) and a flag that
 * is 0 for the timed transform and 1 for the inverse.
 */
void do_beauregard_fft(int rank, int *n, int *n_rev, int N,
                       short is_power_of_two,
                       FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                       int size_arr, int size_work,
                       short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Beauregard");
     if (!FFT_OK) return;

     {
          int log2_len = 0;
          int shrinking;

          /* count the halvings needed to reach 1; the loop body never
             runs for N <= 1, so N == 0 leaves log2_len at 0 */
          for (shrinking = N; shrinking > 1; shrinking >>= 1)
               log2_len += 1;

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          BeauregardFFT(arr, log2_len, 0),
                          1.0, -1, 1,
                          BeauregardFFT(arr, log2_len, 1),
                          1.0,
                          compute_accuracy);
     }
}

/*
 * Benchmark the Bergland FFT (rank 1, power-of-two sizes, N <= 2^20).
 * No inverse transform is supplied (the inverse slot is 0 and
 * compute_accuracy is passed negated).
 */
void do_bergland_fft(int rank, int *n, int *n_rev, int N,
                     short is_power_of_two,
                     FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                     int size_arr, int size_work,
                     short compute_accuracy, factor_type allowed_factors)
{
     /* largest transform size this routine is run with */
     const long bergland_max_n = 1L << 20;

     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bergland");

     if (FFT_OK && N <= bergland_max_n) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          bergland_fft1d(arr,N),
                          1.0, +1, 1,
                          0,
                          1.0/N,
                          -compute_accuracy);
     }
     else if (N > bergland_max_n) {
          skip_benchmark("doesn't work for N > 2^20");
     }
}

/* Function type (and matching pointer type) of the fixed-size transform
   routines selected by GET_BFFT below: each takes the complex array
   and returns nothing. */
typedef void bfft_func(register FFTW_COMPLEX *x);
typedef void (*bfft_func_p)(register FFTW_COMPLEX *x);

/* GET_BFFT pastes "2" onto the "un" prefixes for N == 2; map those names
   onto the plain size-2 routines. */
#define fftc4_un2 fftc4_2
#define fftc8_un2 fftc8_2

/* Declare the routines prefix4, prefix8, ..., prefix1024 as bfft_func.
   The size-2 routine is NOT declared here. */
#define DECL_BFFT(prefix) \
bfft_func prefix##4, prefix##8, prefix##16, prefix##32, \
     prefix##64, prefix##128, prefix##256, prefix##512, prefix##1024

/* Set func to the routine prefix<N> for the variable N in scope at the
   point of use.  N must be one of 2, 4, ..., 1024; for any other value
   func is left unchanged (there is no default case). */
#define GET_BFFT(func,prefix) \
switch(N) { \
    case 2: func = prefix##2; break; \
    case 4: func = prefix##4; break; \
    case 8: func = prefix##8; break; \
    case 16: func = prefix##16; break; \
    case 32: func = prefix##32; break; \
    case 64: func = prefix##64; break; \
    case 128: func = prefix##128; break; \
    case 256: func = prefix##256; break; \
    case 512: func = prefix##512; break; \
    case 1024: func = prefix##1024; break; \
}

/*
 * Benchmark Bernstein's fixed-size FFT routines (rank 1, power-of-two
 * sizes with 2 <= N <= 1024).  Compiled in only with HAVE_BERNSTEIN.
 * The fftc8_* routines are picked when FFTW_REAL has the size of a
 * double, the fftc4_* routines otherwise.
 */
void do_bernstein_fft(int rank, int *n, int *n_rev, int N,
                      short is_power_of_two,
                      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                      int size_arr, int size_work,
                      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_BERNSTEIN
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bernstein (out-of-order) (v0.60)");

     if (FFT_OK && N <= 1024 && N != 1) {
          /* block-scope declarations of every routine GET_BFFT can pick */
          bfft_func fftc4_2, fftc8_2;
          DECL_BFFT(fftc4_);
          DECL_BFFT(fftc4_un);
          DECL_BFFT(fftc8_);
          DECL_BFFT(fftc8_un);
          bfft_func_p forward_fn = 0, inverse_fn = 0;

          if (N != 0) {
               if (sizeof(double) != sizeof(FFTW_REAL)) {
                    GET_BFFT(forward_fn,fftc4_);
                    GET_BFFT(inverse_fn,fftc4_un);
               }
               else {
                    GET_BFFT(forward_fn,fftc8_);
                    GET_BFFT(inverse_fn,fftc8_un);
               }
          }

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          forward_fn(arr),
                          1.0, -1,
                          -1,  /* don't check correctness since out-of-order */
                          inverse_fn(arr),
                          1.0/N,
                          compute_accuracy);
     }
     else if (N > 1024) {
          skip_benchmark("can only handle N <= 1024");
     }
     else if (N == 1) {
          skip_benchmark("can't handle N == 1");
     }
#endif
}

/*
 * Benchmark the Bloodworth FFT (rank 1, power-of-two sizes only),
 * in place on arr, using Bloodworth_Q2_FwdFFT / Bloodworth_Q2_RevFFT.
 */
void do_bloodworth_fft(int rank, int *n, int *n_rev, int N,
                       short is_power_of_two,
                       FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                       int size_arr, int size_work,
                       short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Bloodworth");

     if (FFT_OK) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          Bloodworth_Q2_FwdFFT(arr,N),
                          1.0, +1, 1,
                          Bloodworth_Q2_RevFFT(arr,N),
                          1.0,
                          compute_accuracy);
     }
}

/*
 * Benchmark Brenner's Fortran FFT.  Compiled in only with HAVE_F77 and
 * run only for rank 1 when FFTW_REAL has the size of a double.  Sizes
 * of the form 3*2^m are skipped (see the comment in the body).
 */
void do_brenner_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     if (rank != 1) return;

     if (sizeof(FFTW_REAL) == sizeof(double)) {
          /* by-reference arguments for the Fortran routine */
          int brenner_n = N, brenner_ndim = 1, iform = 1;
          int fwd_sign = -1, inv_sign = +1;
          int odd_part;

          FFT_NAME("Brenner");

          /* There seems to be a bug in Brenner for N=3*2^m.
             Any other N seems to work fine!  Ugh... */
          odd_part = N;
          if (N != 0) {
               /* strip every factor of two */
               while (odd_part % 2 == 0)
                    odd_part /= 2;
          }

          if (odd_part == 3) {
               skip_benchmark("has a bug for N=3*2^m");
          }
          else {
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               FORTRANIZE(brennerfft,
                                          BRENNERFFT)(arr,&brenner_n,
                                                       &brenner_ndim,&fwd_sign,
                                                       &iform,work),
                               1.0, -1, 1,
                               FORTRANIZE(brennerfft,
                                          BRENNERFFT)(arr,&brenner_n,
                                                       &brenner_ndim,&inv_sign,
                                                       &iform,work),
                               1.0/N,
                               compute_accuracy);
          }
     }
#endif
}

/*
 * Benchmark the Burrus Fortran FFT (burruscfft).  Compiled in only with
 * HAVE_F77; run only for rank-1, power-of-two sizes other than 1, and
 * only when FFTW_REAL has the size of a double.
 *
 * The routine is called in place with arr passed as two FFTW_REAL
 * pointers (arr and arr + N), plus N, log2(N) and a sign, all by
 * reference.  Note that the timed transform is called with sign +1 and
 * the inverse with sign -1.
 */
void do_burrus_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Burrus");

     if (FFT_OK && N != 1) {
          int n1 = N, m = 0, is1 = -1, is2 = +1;

          if (N != 0) {
               /* N is a power of two here, so m ends up as log2(N) */
               while (n1 % 2 == 0) {
                    ++m;
                    n1 /= 2;
               }
          }
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(burruscfft,BURRUSCFFT)
                               ((FFTW_REAL*)arr,(FFTW_REAL*)arr + N,
                                &N,&m,&is2),
                          1.0, -1, 0,
                          FORTRANIZE(burruscfft,BURRUSCFFT)
                               ((FFTW_REAL*)arr,(FFTW_REAL*)arr + N,
                                &N,&m,&is1),
                          1.0,
                          compute_accuracy);
     }
     else if (N == 1) {
          skip_benchmark("Burrus can't handle N == 1");
     }
#endif
}

/*
 * Benchmark the CWP FFT (pfacc) for rank-1 transforms, twice:
 * "CWP (min N)" uses the size returned by npfa(N) and "CWP (best N)"
 * the size returned by npfao(N, size_arr).  The chosen size is what is
 * passed to the benchmark macro as the dimension array; a case is
 * skipped when that size is smaller than N or larger than size_arr.
 */
void do_cwp_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                int size_arr, int size_work,
                short compute_accuracy, factor_type allowed_factors)
{
     int cwp_n = N;

     if (rank != 1) return;

     FFT_NAME("CWP (min N)");
     if (N != 0)
          cwp_n = npfa(N);

     if (cwp_n < N) {
          skip_benchmark("this transform size is too big for CWP");
     }
     else if (cwp_n > size_arr) {
          skip_benchmark("array isn't big enough for CWP!");
     }
     else {
          DO_BENCHMARK_ND(rank, &cwp_n, N, arr, arr,
                          pfacc(-1, cwp_n, arr), 1.0, -1, -1,
                          pfacc(+1, cwp_n, arr), 1.0/cwp_n,
                          compute_accuracy);
     }

     FFT_NAME("CWP (best N)");
     if (N != 0)
          cwp_n = npfao(N, size_arr);

     if (cwp_n < N) {
          skip_benchmark("this transform size is too big for CWP");
     }
     else if (cwp_n > size_arr) {
          skip_benchmark("array isn't big enough for CWP!");
     }
     else {
          DO_BENCHMARK_ND(rank, &cwp_n, N, arr, arr,
                          pfacc(-1, cwp_n, arr), 1.0, -1, -1,
                          pfacc(+1, cwp_n, arr), 1.0/cwp_n,
                          compute_accuracy);
     }
}

/*
 * Benchmark the Edelblute FFT (fft_duhamel): rank 1, power-of-two sizes
 * greater than 2.  No inverse transform is supplied (the inverse slot is
 * 0 and compute_accuracy is passed negated).
 */
void do_edelblute_fft(int rank, int *n, int *n_rev, int N,
                      short is_power_of_two,
                      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                      int size_arr, int size_work,
                      short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Edelblute");
     if (!FFT_OK) return;

     if (N == 1 || N == 2) {
          skip_benchmark("Edelblute can't handle N <= 2");
     }
     else if (N == 0 || N > 2) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          fft_duhamel((FFTW_REAL*)arr,
                                      (FFTW_REAL*)arr + N, N),
                          1.0, -1, 0,
                          0,
                          1.0/N,
                          -compute_accuracy);
     }
}

/*
 * Benchmark E. Mayer's Fortran 90 FFT.  Compiled in only when both
 * USE_EMAYER and HAVE_F90 are defined; run only for rank-1, power-of-two
 * sizes when FFTW_REAL has the size of a double.  The Fortran routines
 * receive 2*N by reference, and 2*N must lie in [2^4, 2^19].
 */
void do_emayer_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
#ifdef USE_EMAYER
#ifdef HAVE_F90
     int doubled_n = 2*N;

     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("EMayer");
     if (!FFT_OK) return;

     if (N == 0 || (doubled_n >= (1<<4) && doubled_n <= (1<<19))) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(emayerfft,EMAYERFFT)(arr,&doubled_n),
                          1.0, +1, 1,
                          FORTRANIZE(emayerifft,EMAYERIFFT)(arr,&doubled_n),
                          1.0/N,
                          compute_accuracy);
     }
     else {
          skip_benchmark("EMayer can only handle 2^3 <= N <= 2^18");
     }
#endif
#endif
}

/*
 * Benchmark the Fortran FFTPACK complex FFT for rank-1 transforms.
 * Compiled in only with HAVE_F77.  The dcfft* routines are used when
 * FFTW_REAL has the size of a double, the scfft* routines when it has
 * the size of a float; work is set up by the matching *ffti routine.
 */
void do_fftpack_fft(int rank, int *n, int *n_rev, int N, short is_power_of_two,
                    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int fftpack_n = N;   /* passed by reference to the Fortran routines */

     if (rank != 1) return;

     FFT_NAME("FFTPACK");

     if (sizeof(FFTW_REAL) == sizeof(double)) {
          if (N != 0) {
               FORTRANIZE(dcffti,DCFFTI)(&fftpack_n, work);
          }
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(dcfftf,DCFFTF)(&fftpack_n, arr, work),
                          1.0, -1, 1,
                          FORTRANIZE(dcfftb,DCFFTB)(&fftpack_n, arr, work),
                          1.0/N,
                          compute_accuracy);
     }
     else if (sizeof(FFTW_REAL) == sizeof(float)) {
          if (N != 0) {
               FORTRANIZE(scffti,SCFFTI)(&fftpack_n, work);
          }
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(scfftf,SCFFTF)(&fftpack_n, arr, work),
                          1.0, -1, 1,
                          FORTRANIZE(scfftb,SCFFTB)(&fftpack_n, arr, work),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}


/*
 * Benchmark the f2c translation of FFTPACK for rank-1 transforms.
 * fftpack_cffti sets up work for size N (skipped when N == 0) before
 * fftpack_cfftf / fftpack_cfftb are benchmarked in place on arr.
 */
void do_fftpack_f2c_fft(int rank, int *n, int *n_rev, int N,
                        short is_power_of_two,
                        FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                        int size_arr, int size_work,
                        short compute_accuracy, factor_type allowed_factors)
{
     if (rank == 1) {
          FFT_NAME("FFTPACK (f2c)");

          if (N != 0) {
               fftpack_cffti(N, work);
          }

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          fftpack_cfftf(N, arr, work),
                          1.0, -1, 1,
                          fftpack_cfftb(N, arr, work),
                          1.0/N,
                          compute_accuracy);
     }
}

/*
 * Benchmark FFTW for rank-1 transforms, twice: once with a measured
 * forward plan ("FFTW") and once with an estimated forward plan
 * ("FFTW_ESTIMATE").  Both use an estimated backward plan for the
 * inverse, transforming arr -> work and work -> arr.
 *
 * Returns 0 on success (or when rank != 1), 1 if a plan for the "FFTW"
 * case could not be created, and 666 if a plan for the "FFTW_ESTIMATE"
 * case could not be created.  Any plan that was created is destroyed
 * before returning, on the error paths as well.
 */
short do_fftw_1d_fft(int rank, int *n, int *n_rev, int N,
                     short is_power_of_two,
                     FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                     int size_arr, int size_work,
                     short compute_accuracy, factor_type allowed_factors)
{
     fftw_plan p = 0, p_inv = 0, p_est = 0;
     int wis_flags = 0;

#ifdef FFTW_HAS_WISDOM /* if we are using a version of FFTW with the
                          "wisdom" feature, then take advantage of it */
     wis_flags = FFTW_USE_WISDOM;
#endif

     if (rank != 1) return 0;

     FFT_NAME("FFTW");

     /* don't bother creating plan if we won't do the FFT */
     if (fft_data_cur) {
          if (N != 0) {
#ifdef FFTW_HAS_PLAN_SPECIFIC
               p = fftw_create_plan_specific(N,FFTW_FORWARD,
                                             FFTW_MEASURE|wis_flags,
                                             arr,1,work,1);
               p_inv = fftw_create_plan_specific(N,FFTW_BACKWARD,
                                                 FFTW_ESTIMATE,
                                                 work,1,arr,1);
#else
               p = fftw_create_plan(N,FFTW_FORWARD,FFTW_MEASURE|wis_flags);
               p_inv = fftw_create_plan(N,FFTW_BACKWARD,FFTW_ESTIMATE);
#endif
               if (!p || !p_inv) {
                    log_printf("\n\nError creating FFTW plan!\n");
                    printf("\n\nError creating FFTW plan!\n");
                    /* release whichever plan did get created */
                    if (p)
                         fftw_destroy_plan(p);
                    if (p_inv)
                         fftw_destroy_plan(p_inv);
                    return 1;
               }
#ifdef FFTW_HAS_FPRINT_PLAN
               if (bench_log_file) {
                    log_printf("\nFFTW_MEASURE ");
                    fftw_fprint_plan(bench_log_file,p);
                    log_printf("\n");
               }
#endif
          }

          DO_BENCHMARK_ND(rank, n, N, arr, work,
                          fftw(p,1,arr,1,0,work,1,0),
                          1.0, FFTW_FORWARD, 1,
                          fftw(p_inv,1,work,1,0,arr,1,0), 1.0/N,
                          compute_accuracy);

          /* the measured plan is not needed any more; p_inv is kept for
             the FFTW_ESTIMATE case below */
          if (N != 0) {
               fftw_destroy_plan(p);
               p = 0;
          }
     }

     FFT_NAME("FFTW_ESTIMATE");

     /* don't bother creating plan if we won't do the FFT */
     if (fft_data_cur) {
          if (N != 0) {
#ifdef FFTW_HAS_PLAN_SPECIFIC
               if (!p_inv)
                    p_inv = fftw_create_plan_specific(N,FFTW_BACKWARD,
                                                      FFTW_ESTIMATE,
                                                      work,1,arr,1);
               p_est = fftw_create_plan_specific(N,FFTW_FORWARD,
                                                 FFTW_ESTIMATE,
                                                 arr,1,work,1);
#else
               if (!p_inv)
                    p_inv = fftw_create_plan(N,FFTW_BACKWARD,FFTW_ESTIMATE);
               p_est = fftw_create_plan(N,FFTW_FORWARD,FFTW_ESTIMATE);
#endif
               if (!p_est || !p_inv) {
                    log_printf("\n\nError creating FFTW_ESTIMATE plan!\n");
                    printf("\n\nError creating FFTW_ESTIMATE plan!\n");
                    /* release whichever plan did get created */
                    if (p_est)
                         fftw_destroy_plan(p_est);
                    if (p_inv)
                         fftw_destroy_plan(p_inv);
                    return 666; /* historical error code, kept for callers */
               }
#ifdef FFTW_HAS_FPRINT_PLAN
               if (bench_log_file) {
                    log_printf("\nFFTW_ESTIMATE ");
                    fftw_fprint_plan(bench_log_file,p_est);
                    log_printf("\n");
               }
#endif
          }

          DO_BENCHMARK_ND(rank, n, N, arr, work,
                          fftw(p_est,1,arr,1,0,work,1,0),
                          1.0, FFTW_FORWARD, 1,
                          fftw(p_inv,1,work,1,0,arr,1,0), 1.0/N,
                          compute_accuracy);
     }

     if (p_inv)
          fftw_destroy_plan(p_inv);
     if (p_est)
          fftw_destroy_plan(p_est);

     return 0;
}

/*
 * Benchmark the "Frigo-old" FFT for rank-1 transforms.
 * cilk_many_fft_init fills in work and a local factor table for size N
 * (skipped when N == 0); cilk_fft_aux then transforms arr into work.
 * No inverse transform is supplied (the inverse slot is 0 and
 * compute_accuracy is passed negated).
 */
void do_frigo_fft(int rank, int *n, int *n_rev, int N,
                  short is_power_of_two,
                  FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                  int size_arr, int size_work,
                  short compute_accuracy, factor_type allowed_factors)
{
     int cilk_factors[40];

     if (rank != 1) return;

     FFT_NAME("Frigo-old");

     if (N != 0) {
          cilk_many_fft_init(N, work, cilk_factors);
     }

     DO_BENCHMARK_ND(rank, n, N, arr, work,
                     cilk_fft_aux(N, arr, work, cilk_factors,
                                  work + N, N),
                     1.0, -1, 1,
                     0, 1.0,
                     -compute_accuracy);
}

/*
 * Benchmark the Monnier FFT for rank-1 transforms.  ouvre_fft(N, 1.0)
 * prepares the transform and ferme_fft() releases it again; the inverse
 * is done by closing, re-opening with -1.0 and transforming once more.
 * The benchmark is skipped if ouvre_fft reports an error.
 */
void do_monnier_fft(int rank, int *n, int *n_rev, int N,
                    short is_power_of_two,
                    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
     int open_status = 0;

     if (rank != 1) return;

     FFT_NAME("Monnier");

     if (N != 0) {
          open_status = ouvre_fft(N, 1.0);
     }

     if (open_status != 0) {
          skip_benchmark("error initializing data");
     }
     else {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          monnier_fft(arr),
                          1.0, 1, 1,
                          { ferme_fft();ouvre_fft(N, -1.0);monnier_fft(arr); },
                          1.0,
                          compute_accuracy);

          /* close whatever ouvre_fft call was made last */
          if (N != 0) {
               ferme_fft();
          }
     }
}

/*
 * Benchmark the f2c translation of Temperton's GPFA FFT for rank-1
 * transforms.  The size must pass check_prime_factors(N, 5) and, for
 * N != 0, setgpfa(work, N) must return 0; otherwise the benchmark is
 * skipped.
 */
void do_temperton_fft(int rank, int *n, int *n_rev, int N,
                      short is_power_of_two,
                      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                      int size_arr, int size_work,
                      short compute_accuracy, factor_type allowed_factors)
{
     int gpfa_usable;

     if (rank != 1) return;

     FFT_NAME("Temperton (f2c)");

     /* setgpfa is only called when the factor check passed and N != 0 */
     gpfa_usable = check_prime_factors(N,5)
                   && (N == 0 || 0 == setgpfa(work,N));

     if (gpfa_usable) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          gpfa((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                               work, 2, 2 * N, N, 1, -1),
                          1.0, -1, 1,
                          gpfa((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                               work, 2, 2 * N, N, 1, +1),
                          1.0/N,
                          compute_accuracy);
     }
     else {
          skip_benchmark("Temperton only handles N = 2^m 3^n 5^q");
     }
}

/*
 * Benchmark the Fortran version of Temperton's GPFA FFT for rank-1
 * transforms.  Compiled in only with HAVE_F77 and run only when
 * FFTW_REAL has the size of a double.  Sizes that fail
 * check_prime_factors(N, 5) are skipped.
 */
void do_temperton_f_fft(int rank, int *n, int *n_rev, int N,
                        short is_power_of_two,
                        FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                        int size_arr, int size_work,
                        short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     /* by-reference arguments for the Fortran routine */
     int two = 2, twoN = 2*N, one = 1;
     int fwd_sign = -1, inv_sign = +1;
     int bad_size = 0;

     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     if (rank != 1) return;

     FFT_NAME("Temperton");

     if (N != 0) {
          /* ensure that array size is power of 2, 3, & 5 only: */
          if (check_prime_factors(N,5))
               FORTRANIZE(gpfafsetgpfa,GPFAFSETGPFA)(work,&N);
          else
               bad_size = 1;
     }

     if (bad_size) {
          skip_benchmark("Temperton only handles N = 2^m 3^n 5^q");
     }
     else {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(gpfafgpfa,GPFAFGPFA)
                              ((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                               work, &two, &twoN, &N, &one, &fwd_sign),
                          1.0, -1, 1,
                          FORTRANIZE(gpfafgpfa,GPFAFGPFA)
                              ((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                               work, &two, &twoN, &N, &one, &inv_sign),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}

/*
 * Benchmark Green's FFT for rank-1, power-of-two sizes.  Nothing is
 * done when pointers and ints differ in size.  green_fftInit is given
 * log2(N); if it fails the benchmark is skipped, otherwise
 * green_fftFree is called once the benchmark is done.
 */
void do_green_fft(int rank, int *n, int *n_rev, int N,
                  short is_power_of_two,
                  FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                  int size_arr, int size_work,
                  short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;
     /* green is buggy with 64 bit pointers */
     if (sizeof (void *) != sizeof(int)) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Green");
     if (!FFT_OK) return;

     {
          /* log2(N), or -1 once initialization has failed */
          int green_log2 = 0;

          if (N > 0) {
               int halved;

               for (halved = N; halved > 1; halved /= 2)
                    green_log2 += 1;

               if (green_fftInit(green_log2) != 0)
                    green_log2 = -1;
          }

          if (green_log2 < 0) {
               skip_benchmark("Green can't handle this size.");
          }
          else {
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               green_ffts(arr, green_log2, 1),
                               1.0, -1, 1,
                               green_iffts(arr, green_log2, 1),
                               1.0,
                               compute_accuracy);
               if (N > 0) {
                    green_fftFree();
               }
          }
     }
}

/*
 * Benchmark Green's 3D FFT for rank-3, power-of-two sizes.  Nothing is
 * done when pointers and ints differ in size.  green_fft3dInit is given
 * the log2 of each of the three dimensions in n; if it fails the
 * benchmark is skipped, otherwise green_fft3dFree is called once the
 * benchmark is done.
 */
void do_green_3d_fft(int rank, int *n, int *n_rev, int N,
                     short is_power_of_two,
                     FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                     int size_arr, int size_work,
                     short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 3) return;
     /* green is buggy with 64 bit pointers */
     if (sizeof (void *) != sizeof(int)) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Green");
     if (!FFT_OK) return;

     {
          int green_log2[3] = {0,0,0};
          int init_failed = 0;

          if (N != 0) {
               int axis;

               for (axis = 0; axis < 3; ++axis) {
                    int halved;

                    /* count the halvings needed to bring n[axis] to 1 */
                    for (halved = n[axis]; halved > 1; halved /= 2)
                         green_log2[axis] += 1;
               }
               init_failed = green_fft3dInit(green_log2[0],green_log2[1],
                                             green_log2[2]);
          }

          if (init_failed != 0) {
               skip_benchmark("Green can't handle this size.");
          }
          else {
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               green_fft3d(arr, green_log2[0],green_log2[1],green_log2[2]),
                               1.0, -1, 1,
                               green_ifft3d(arr, green_log2[0],green_log2[1],green_log2[2]),
                               1.0,
                               compute_accuracy);
               if (N != 0) {
                    green_fft3dFree();
               }
          }
     }
}

/*
 * Benchmark the GSL complex FFTs for rank-1 transforms: the mixed-radix
 * routine ("GSL") and the two radix-2 routines ("GSL DIT" and
 * "GSL DIF"), the latter two registered as power-of-two only.
 */
void do_gsl_fft(int rank, int *n, int *n_rev, int N,
                short is_power_of_two,
                FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                int size_arr, int size_work,
                short compute_accuracy, factor_type allowed_factors)
{
     gsl_fft_complex_wavetable gsl_table;
     int init_status = 0;

     if (rank != 1) return;

     /******* Mixed-radix routine ********/

     FFT_NAME("GSL");
     if (N != 0) {
          /* Initialize the wavetable to point to our work array,
             rather than reallocating.  Yes, this breaks the abstraction,
             but it saves us memory. */

          gsl_table.scratch = (complex*)work;
          gsl_table.trig = (complex*)work + N;

          /* Now, initialize: */
          init_status = gsl_fft_complex_init(N,&gsl_table);
     }

     if (init_status != 0) {
          if (N != 0) {
               skip_benchmark("Error initializing GSL wavetable!");
          }
     }
     else {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          gsl_fft_complex_forward((complex*)arr,N,&gsl_table),
                          1.0, -1, 1,
                          gsl_fft_complex_backward((complex*)arr,N,&gsl_table),
                          1.0/N,
                          compute_accuracy);
     }

     /****** Radix 2 DIT and DIF routines ********/

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("GSL DIT");
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     gsl_fft_complex_radix2_forward((complex*)arr,N),
                     1.0, -1, 1,
                     gsl_fft_complex_radix2_backward((complex*)arr,N),
                     1.0/N,
                     compute_accuracy);

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("GSL DIF");
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     gsl_fft_complex_radix2_dif_forward((complex*)arr,N),
                     1.0, -1, 1,
                     gsl_fft_complex_radix2_dif_backward((complex*)arr,N),
                     1.0/N,
                     compute_accuracy);
}

/*
 * Benchmark the Krukar FFT (fft_in_C / ifft_in_C): rank 1, power-of-two
 * sizes, N <= 4096 and N != 1.
 */
void do_krukar_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Krukar");
     if (!FFT_OK) return;

     if (N > 4096) {
          skip_benchmark("can't handle N > 4096");
     }
     else if (N == 1) {
          skip_benchmark("can't handle N == 1");
     }
     else {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          fft_in_C(arr, N), 1.0, -1, 1,
                          ifft_in_C(arr, N), 1.0,
                          compute_accuracy);
     }
}

/*
 * Benchmark three of R. Mayer's FFT variants for rank-1, power-of-two
 * sizes: "RMayer (Buneman)" (mayer_fft_1, limited to N <= 2^19),
 * "RMayer (simple)" (mayer_fft_5) and "RMayer (lookup)" (mayer_fft_7,
 * which is pointed at the work array through mayer_hugetrigtab).
 * None of them is run for non-zero N <= 2.
 */
void do_rmayer_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("RMayer (Buneman)");
     if (FFT_OK) {
          if (N != 0 && N <= 2) {
               skip_benchmark("can't handle N <= 2");
          }
          else if (N > (1<<19)) {
               skip_benchmark("can't handle N > 2^19");
          }
          else {
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               mayer_fft_1(N, (FFTW_REAL*)arr, (FFTW_REAL*)arr + N),
                               1.0, -1, 0,
                               mayer_ifft_1(N, (FFTW_REAL*)arr,(FFTW_REAL*)arr + N),
                               1.0,
                               compute_accuracy);
          }
     }

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("RMayer (simple)");
     if (FFT_OK) {
          if (N != 0 && N <= 2) {
               skip_benchmark("can't handle N <= 2");
          }
          else {
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               mayer_fft_5(N, (FFTW_REAL*)arr, (FFTW_REAL*)arr + N),
                               1.0, -1, -1,
                               mayer_ifft_5(N, (FFTW_REAL*)arr,(FFTW_REAL*)arr + N),
                               1.0,
                               compute_accuracy);
          }
     }

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("RMayer (lookup)");
     if (FFT_OK) {
          if (N != 0 && N <= 2) {
               skip_benchmark("can't handle N <= 2");
          }
          else {
               extern FFTW_REAL *mayer_hugetrigtab;

               /* let the lookup variant use our work array as its table */
               mayer_hugetrigtab = (FFTW_REAL*)work;
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               mayer_fft_7(N, (FFTW_REAL*)arr, (FFTW_REAL*)arr + N),
                               1.0, -1, 0,
                               mayer_ifft_7(N, (FFTW_REAL*)arr,(FFTW_REAL*)arr + N),
                               1.0,
                               compute_accuracy);
          }
     }
}

/*
 * Benchmark Monro's Fortran FFT (monrofastf).  Compiled in only with
 * HAVE_F77; run only for rank-1, power-of-two sizes with
 * 2 < N <= 2^20 when FFTW_REAL has the size of a double.  The timed
 * transform is called with N and the inverse with -N, both by reference.
 */
void do_monro_fft(int rank, int *n, int *n_rev, int N,
                  short is_power_of_two,
                  FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                  int size_arr, int size_work,
                  short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int negated_n = -N;

     if (sizeof(FFTW_REAL) != sizeof(double)) return;
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Monro");
     if (!FFT_OK) return;

     if (N != 0 && N <= 2) {
          skip_benchmark("Monro can't handle N <= 2");
     }
     else if (N > 1048576) {
          skip_benchmark("Monro can't handle N > 2^20");
     }
     else {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(monrofastf,MONROFASTF)
                          ((FFTW_REAL*)arr, (FFTW_REAL*)arr + N,&N),
                          1.0*N, -1, 0,
                          FORTRANIZE(monrofastf,MONROFASTF)
                          ((FFTW_REAL*)arr, (FFTW_REAL*)arr + N,&negated_n),
                          1.0,
                          compute_accuracy);
     }
#endif
}

/*
 * Benchmark the f2c translation of the NAPACK FFT for rank-1,
 * power-of-two sizes.  No inverse transform is supplied (the inverse
 * slot is 0 and compute_accuracy is passed negated).
 */
void do_napack_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
     if (rank == 1) {
          FFT_REQUIRE_POWER_OF_TWO; /* NAPACK is buggy for non powers of two */
          FFT_NAME("NAPACK (f2c)");

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          napack_fft(arr, &N, work),
                          1.0, +1, 1,
                          0,
                          1.0/N,
                          -compute_accuracy);
     }
}

/*
 * Benchmark Nielsen's mixfft for rank-1 transforms.  Compiled in only
 * with HAVE_NIELSEN.  mixfft is given arr and work, each passed as two
 * FFTW_REAL pointers N elements apart.  No inverse transform is
 * supplied (the inverse slot is 0 and compute_accuracy is passed
 * negated).
 */
void do_nielsen_fft(int rank, int *n, int *n_rev, int N,
                    short is_power_of_two,
                    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NIELSEN
     if (rank == 1) {
          FFT_NAME("Nielsen");

          DO_BENCHMARK_ND(rank, n, N, arr, work,
                          mixfft(N, (FFTW_REAL*)arr, ((FFTW_REAL*)arr) + N,
                                 (FFTW_REAL*)work, ((FFTW_REAL*)work) + N),
                          1.0, -1, 0,
                          0, 1.0,
                          -compute_accuracy);
     }
#endif
}

/*
 * Benchmark the "NR (C)" routine nrc_four1 for rank-1, power-of-two
 * sizes.  Compiled in only with HAVE_NRC.
 */
void do_nrc_1d_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NRC
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (C)");

     if (FFT_OK) {
          /* NRC is 1-based!  Hand it a pointer one FFTW_REAL before the
             start of arr. */
          FFTW_REAL *one_based = ((FFTW_REAL*)arr) - 1;

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          nrc_four1(one_based, N, -1),
                          1.0, -1, 1,
                          nrc_four1(one_based, N, +1),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}

/*
 * Benchmark the "NR (F)" Fortran routine nrffour1 for rank-1,
 * power-of-two sizes.  Compiled in only when both HAVE_NRF and HAVE_F77
 * are defined, and run only when FFTW_REAL has the size of a double.
 */
void do_nrf_1d_fft(int rank, int *n, int *n_rev, int N,
                   short is_power_of_two,
                   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                   int size_arr, int size_work,
                   short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NRF
#ifdef HAVE_F77
     if (rank != 1 || sizeof(FFTW_REAL) != sizeof(double))
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (F)");

     if (FFT_OK) {
          /* by-reference arguments for the Fortran routine */
          int nrf_n = N;
          int fwd_sign = -1, inv_sign = +1;

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(nrffour1,NRFFOUR1)(arr, &nrf_n, &fwd_sign),
                          1.0, -1, 1,
                          FORTRANIZE(nrffour1,NRFFOUR1)(arr, &nrf_n, &inv_sign),
                          1.0/N,
                          compute_accuracy);
     }
#endif
#endif
}

/*
 * Benchmark Ooura's complex FFT for rank-1, power-of-two sizes: the C
 * routine ("Ooura (C)") and, when HAVE_F77 is defined and FFTW_REAL has
 * the size of a double, the Fortran routine ("Ooura (F)").
 *
 * Both routines are called with 2*N, a sign, arr, an integer work area
 * ip allocated here, and work.  Before the timed runs each routine is
 * called once with ip[0] == 0.  The process exits with status 1 if ip
 * cannot be allocated.
 */
void do_ooura_fft(int rank, int *n, int *n_rev, int N,
                  short is_power_of_two,
                  FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                  int size_arr, int size_work,
                  short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Ooura (C)");
     if (FFT_OK) {
          int *ip = 0, len_ip, n2=2*N, is1=-1, is2=+1;

          if (N != 0) {
               /* size of the integer work area; the double result is
                  truncated to int */
               if (N % 4 == 0)
                    len_ip = (int) (sqrt((double) N) + 3);
               else
                    len_ip = (int) (sqrt((double) (N/2)) + 3);
               ip = (int*) fftw_malloc(sizeof(int)*len_ip);
               if (!ip) {
                    printf("\nERROR!  Out of memory!\n");
                    exit(1);
               }
               ip[0] = 0;
               ooura_c_cdft(n2, +1, arr, ip, work);
          }
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          ooura_c_cdft(n2, +1, arr, ip, work),
                          1.0, +1, 1,
                          ooura_c_cdft(n2, -1, arr, ip, work),
                          1.0/N,
                          compute_accuracy);
#ifdef HAVE_F77
          if (sizeof(FFTW_REAL) == sizeof(double)) {
               FFT_REQUIRE_POWER_OF_TWO;
               FFT_NAME("Ooura (F)");

               if (FFT_OK && N != 0) {
                    ip[0] = 0;
                    FORTRANIZE(oourafcdft,OOURAFCDFT)(&n2, &is2, arr,
                                                        ip, work);
               }
               DO_BENCHMARK_ND(rank, n, N, arr, arr,
                               FORTRANIZE(oourafcdft,
                                          OOURAFCDFT)(&n2, &is2, arr,
                                                        ip, work),
                               1.0, +1, 1,
                               FORTRANIZE(oourafcdft,
                                          OOURAFCDFT)(&n2, &is1, arr,
                                                        ip, work),
                               1.0/N,
                               compute_accuracy);
          }
#endif
          /* Release ip whenever it was allocated.  FFT_OK is not
             consulted again here: it was evaluated for "Ooura (C)" when
             ip was allocated, and FFT_NAME("Ooura (F)") has been issued
             since, so re-testing it could leave ip allocated. */
          if (ip)
               fftw_free(ip);
     }
}

/*
 * Benchmark the QFT routine cfqft for rank-1, power-of-two sizes with
 * N >= 16.  Nothing is done when pointers and ints differ in size.
 * qftinit is handed the part of work that starts N complex elements in.
 * No inverse transform is supplied (the inverse slot is 0 and
 * compute_accuracy is passed negated).
 */
void do_qft_fft(int rank, int *n, int *n_rev, int N,
                short is_power_of_two,
                FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                int size_arr, int size_work,
                short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1) return;
     /* QFT is buggy with 64 bit pointers */
     if (sizeof (void *) != sizeof(int)) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("QFT");
     if (!FFT_OK) return;

     if (N != 0 && N < 16) {
          skip_benchmark("QFT requires N >= 16");
     }
     else {
          if (N != 0) {
               qftinit(N,(void *) (work + N));
          }

          DO_BENCHMARK_ND(rank, n, N, arr, work,
                          cfqft((FFTW_REAL*)work,
                                N + (FFTW_REAL*)work,
                                (FFTW_REAL*)arr,
                                N + (FFTW_REAL*)arr,
                                N,N),
                          1.0, -1, 0,
                          0,
                          1.0/N,
                          -compute_accuracy);
     }
}

/* Benchmark entry for the "Ransom" table-based six-step FFT.  Only
   rank-1, power-of-two problems are handled; sizes 1 and 2 are reported
   as skipped. */
void do_ransom_fft(int rank, int *n, int *n_rev, int N,
		   short is_power_of_two,
		   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		   int size_arr, int size_work,
		   short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1)
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Ransom");
     if (!FFT_OK)
          return;

     if (N == 1 || N == 2) {
          skip_benchmark("doesn't work for N <= 2");
          return;
     }

     /* The transform runs in place on arr; the first call uses -1 and
        the second +1 as the routine's last argument. */
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     tablesixstepfft(arr, N, -1),
                     1.0, -1, 1,
                     tablesixstepfft(arr, N, +1),
                     1.0/N,
                     compute_accuracy);
}

/* Benchmark entry for the SCIPORT Fortran FFT (compiled only with
   HAVE_F77).  Handles rank-1, power-of-two problems; sizes 1..3 are
   reported as skipped.  The double- or single-precision routine is
   chosen from the size of FFTW_REAL. */
void do_sciport_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     typedef void sp_fft_proc(int *init, int *is, int *n,
                              FFTW_COMPLEX *x, FFTW_COMPLEX *work,
                              FFTW_COMPLEX *y);
     typedef void (*sp_fft_proc_ptr)(int *init, int *is, int *n,
                                     FFTW_COMPLEX *x, FFTW_COMPLEX *work,
                                     FFTW_COMPLEX *y);
     extern sp_fft_proc
          FORTRANIZE(spscfft2,SPSCFFT2), FORTRANIZE(spdcfft2,SPDCFFT2);
     sp_fft_proc_ptr sp_fft;
     int init = 1, is1 = -1, is2 = +1;

     if (rank != 1)
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("SCIPORT");
     if (!FFT_OK)
          return;

     sp_fft = (sizeof(FFTW_REAL) == sizeof(double))
          ? FORTRANIZE(spdcfft2,SPDCFFT2)
          : FORTRANIZE(spscfft2,SPSCFFT2);

     if (N != 0 && N < 4) {
          skip_benchmark("can't handle N < 4");
          return;
     }

     /* One call with init == 1 before timing; every later call is made
        with init == 0. */
     if (N != 0)
          sp_fft(&init, &is1, &N, arr, work, arr);
     init = 0;

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     sp_fft(&init,&is1,&N,arr,work,arr),
                     1.0, -1, 1,
                     sp_fft(&init,&is2,&N,arr,work,arr),
                     1.0/N,
                     compute_accuracy);
#endif
}

/* Benchmark entry for Singleton's Fortran FFT (compiled only with
   HAVE_F77, and active only when FFTW_REAL has the size of a double).
   Handles rank-1 problems for which check_prime_factors(N, 23)
   succeeds; other sizes are reported as skipped. */
void do_singleton_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     if (rank != 1) return;
     if (sizeof(FFTW_REAL) == sizeof(double)) {
	  /* Last argument of the Fortran routine: -2 for the first call,
	     +2 for the second. */
	  int is1 = -2, is2 = +2;

	  FFT_NAME("Singleton");

	  /* The routine is given pointers to the first and second real of
	     arr, and N for each of its three size arguments. */
	  if (check_prime_factors(N,23))
	       DO_BENCHMARK_ND(rank, n, N, arr, arr,
			       FORTRANIZE(singletonfft,
					  SINGLETONFFT)((FFTW_REAL*)arr,
							(FFTW_REAL*)arr + 1, 
							&N, &N, &N, &is1),
			       1.0, -1, 1,
			       FORTRANIZE(singletonfft,
					  SINGLETONFFT)((FFTW_REAL*)arr,
							(FFTW_REAL*)arr + 1, 
							&N, &N, &N, &is2),
			       1.0/N,
			       compute_accuracy);
	  else
	       skip_benchmark("can't handle prime factors > 23");
     }
#endif
}

/* Benchmark entry for the f2c translation of Singleton's FFT (go_fft).
   Handles rank-1 problems for which check_prime_factors(N, 23)
   succeeds; other sizes are reported as skipped. */
void do_singleton_f2c_fft(int rank, int *n, int *n_rev, int N, 
		short is_power_of_two,
                FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                int size_arr, int size_work,
                short compute_accuracy, factor_type allowed_factors)
{
     if (rank != 1)
          return;

     FFT_NAME("Singleton (f2c)");

     if (!check_prime_factors(N, 23)) {
          skip_benchmark("can't handle prime factors > 23");
          return;
     }

     /* go_fft() receives pointers to the first and second real of arr,
        N for all three size arguments, and -2 / +2 as its last argument. */
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                            N, N, N, -2),
                     1.0, -1, 1,
                     go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr + 1,
                            N, N, N, +2),
                     1.0/N,
                     compute_accuracy);
}

/* Benchmark entry for Sorensen's Fortran FFT (compiled only with
   HAVE_F77, active only when FFTW_REAL has the size of a double).
   Handles rank-1, power-of-two problems; N == 1 is reported as skipped.
   For N != 0 an integer table is allocated, the routine's tables are
   initialized inside the work array, and the table is freed afterwards. */
void do_sorensen_ffts(int rank, int *n, int *n_rev, int N,
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int m = 0;                          /* smallest m with 2^m >= N */
     int len = 0;                        /* spacing of the four tables in w */
     int *itab = 0;
     FFTW_REAL *w = (FFTW_REAL *) work;

     if (sizeof(FFTW_REAL) != sizeof(double))
          return;
     if (rank != 1)
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Sorensen");
     if (!FFT_OK)
          return;

     if (N == 1) {
          skip_benchmark("can't handle N == 1");
          return;
     }

     if (N != 0) {
          int len_itab;

          for (m = 0; (1 << m) < N; ++m)
               ;

          len_itab = sqrt(2.0*N) + 1.0;
          itab = (int *) fftw_malloc(sizeof(int) * len_itab);
          if (!itab) {
               printf("\nERROR! Out of memory for Sorensen!\n");
               exit(1);
          }

          len = (N + 7) / 8;
          FORTRANIZE(tinit,TINIT)(&m, w, w+len,
                                  w+2*len, w+3*len, itab);
     }

     /* arr is handed over as two runs of N reals.  No inverse call is
        supplied (0) and compute_accuracy is negated. */
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     FORTRANIZE(ctfftsr,CTFFTSR)((FFTW_REAL*)arr,
                                                 (FFTW_REAL*)arr + N,
                                                 &m,w,w+len,
                                                 w+2*len,w+3*len,
                                                 itab),
                     1.0, -1, 0,
                     0,
                     1.0/N,
                     -compute_accuracy);

     if (N != 0)
          fftw_free(itab);
#endif
}

/* Benchmark entry for Sorensen's decimation-in-time Fortran FFT
   (compiled only with HAVE_F77, active only when FFTW_REAL has the size
   of a double).  Handles rank-1, power-of-two problems. */
void do_sorensen_dit_fft(int rank, int *n, int *n_rev, int N, 
			 short is_power_of_two,
			 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
			 int size_arr, int size_work,
			 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
if (sizeof(FFTW_REAL) == sizeof(double)) {
     if (rank != 1) return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("Sorensen DIT");
     if (!FFT_OK) return;

     {
	  int n1 = N, m = 0, is1 = -1, is2 = +1;
	  
	  /* m = number of factors of two in N (left at 0 for N == 0). */
	  if (N != 0) {
	       while (n1 % 2 == 0) {
		    ++m;
		    n1 /= 2;
	       }
	  }
	  /* arr is handed over as two runs of N reals.
	     NOTE(review): the first call passes is2 (+1) while the sign
	     given to DO_BENCHMARK_ND is -1; presumably SFFTFU's flag uses
	     the opposite convention -- confirm. */
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  FORTRANIZE(sfftfu,SFFTFU)
			       ((FFTW_REAL*)arr,(FFTW_REAL*)arr + N,
				&N,&m,&is2),
			  1.0, -1, 0,
			  FORTRANIZE(sfftfu,SFFTFU)
			       ((FFTW_REAL*)arr,(FFTW_REAL*)arr + N,
				&N,&m,&is1),
			  1.0,
			  compute_accuracy);	  
     }
}
#endif
}

/* Benchmark entry for the Valkenburg FFT (rank-1 problems only).  The
   first call, rft(), goes from arr to work and the second, fft(), from
   work back to arr; W_deallocate() is called once the benchmark has run. */
void do_valkenburg_fft(int rank, int *n, int *n_rev, int N, 
		       short is_power_of_two,
		       FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		       int size_arr, int size_work,
		       short compute_accuracy, factor_type allowed_factors)
{
     extern void W_deallocate(void);

     if (rank == 1) {
          FFT_NAME("Valkenburg");

          DO_BENCHMARK_ND(rank, n, N, arr, work,
                          rft(arr, N, work),
                          1.0, +1, 1,
                          fft(work, N, arr),
                          1.0,
                          compute_accuracy);

          W_deallocate();
     }
}

/* Benchmark entry for the SCILIB CCFFT routine (compiled only with
   HAVE_LIBSCI, active only when FFTW_REAL has the size of a double).
   Handles rank-1 problems.  For N != 0, CCFFT is first called once with
   isign == 0 before the timed calls. */
void do_scilib_1d_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBSCI
     if (rank != 1) return;
     if (sizeof(FFTW_REAL) == sizeof(double)) {
	  /* isign and n1 are given defined values up front so that the
	     benchmark call never sees indeterminate values, even when
	     N == 0 skips the set-up below. */
	  int isign = -1, isign2 = +1, n1 = N, isys[1] = {0};
	  FFTW_REAL scale = 1.0;
	  /* Second scratch area: starts size_work reals into work. */
	  FFTW_REAL *work2 = ((FFTW_REAL*)work) + size_work;
	  
	  FFT_NAME("SCILIB");
     
	  if (N != 0) {
	       n1 = N;
	       isign = 0;
	       CCFFT(&isign, &n1, &scale, arr, arr, 
		     work2, (FFTW_REAL*)work, 
		     isys);
	       isign = -1;
	  }
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  CCFFT(&isign, &n1, &scale, arr, arr,
				work2,
				(FFTW_REAL*)work,
				isys),
			  1.0, -1, 1,
			  CCFFT(&isign2, &n1, &scale, arr, arr,
				work2,
				(FFTW_REAL*)work,
				isys),
			       1.0/N,
			  compute_accuracy);
     }
#endif
}

/* Benchmark entry for the ESSL 1D complex FFT (compiled only with
   HAVE_LIBESSL).  Handles rank-1 problems.  A size is accepted when
   N == 0, or when N is even, at least 6, and what remains of N after
   dividing out every factor of two is below 10; any other size is
   reported as skipped.  dcft or scft is chosen from the size of
   FFTW_REAL. */
void do_essl_1d_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBESSL
     int n2;
	
     if (rank != 1) return;

     FFT_NAME("ESSL");

     /* n2 = N with all factors of two divided out. */
     if (N > 0) {
	  n2 = N;
	  while (n2 % 2 == 0) n2 /= 2;
     }
     /* n2 is only read once N >= 6 has been established, so it is always
	set by the loop above when it is inspected here. */
     if (N == 0 || (N >= 6 && N % 2 == 0 && n2 < 10)) {
	  FFTW_REAL scale = 1.0;
	  essl_fft_proc *ESSL_FFT;
	  int n1;
	  int isign, d1, d2, d3, d4, d5, naux1, naux2, init;
	  /* Second auxiliary area: starts size_work reals into work. */
	  FFTW_REAL *work2 = ((FFTW_REAL*)work) + size_work;
	  
	  n1 = N;
	  d1 = 1;
	  d2 = 0;
	  d3 = 1;
	  d4 = 0;
	  d5 = 1;
	  isign = -1;
	  naux1 = naux2 = size_work;
	  
	  if (sizeof(double) == sizeof(FFTW_REAL))
	       ESSL_FFT = dcft;
	  else
	       ESSL_FFT = scft;
	  
	  /* One call with init == 1 (and isign == -1) before timing. */
	  if (N != 0) {
	       init = 1;
	       ESSL_FFT(&init, arr, &d1, &d2, arr, 
			&d3, &d4, &n1, &d5, &isign, 
			&scale, (FFTW_REAL*)work, 
			&naux1, work2, &naux2);
	  }
	  init = 0;
	  
	  /* The second transform argument is a comma expression: it sets
	     isign = 1, calls the routine once with init == 1, then calls
	     it again with init == 0.  isign and init are not reset
	     afterwards.
	     NOTE(review): the sign passed to DO_BENCHMARK_ND is +1 while
	     the first call runs with isign == -1; presumably ESSL's isign
	     convention is the reverse of the benchmark's -- confirm. */
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  ESSL_FFT(&init, arr, &d1, &d2, arr,
				   &d3, &d4, &n1, &d5, &isign,
				   &scale, (FFTW_REAL*)work, &naux1,
				   work2, &naux2),
			  1.0, +1, 1,
			  (isign = 1, init = 1,
			   ESSL_FFT(&init, arr, &d1, &d2, arr,
				    &d3, &d4, &n1, &d5, &isign,
				    &scale, (FFTW_REAL*)work, &naux1,
					      work2, &naux2),
			   init = 0,
			   ESSL_FFT(&init, arr, &d1, &d2, arr,
				    &d3, &d4, &n1, &d5, &isign,
				    &scale, (FFTW_REAL*)work, &naux1,
				    work2, &naux2)),
			  1.0/N,
			  compute_accuracy);
	  
     }
     else
	  skip_benchmark("only works for N = x * 2^m, "
			 "where x < 10 and m >= 1");
#endif
}

/* Benchmark entry for the Sun Performance Library FFT (compiled only
   with HAVE_LIBSUNPERF).  Handles rank-1 problems.  The z* routines are
   used when FFTW_REAL has the size of a double, the c* routines
   otherwise.  For N != 0 the init routine is run once on work. */
void do_sunperf_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBSUNPERF
     typedef void ffti_proc(int,FFTW_COMPLEX *);
     typedef void fftfb_proc(int,FFTW_COMPLEX *,FFTW_COMPLEX *);
     extern ffti_proc zffti,cffti;
     extern fftfb_proc zfftf, zfftb, cfftf, cfftb;
     ffti_proc *sunperf_ffti;
     fftfb_proc *sunperf_fftf, *sunperf_fftb;
     const int use_double = (sizeof(double) == sizeof(FFTW_REAL));

     if (rank != 1)
          return;

     FFT_NAME("SUNPERF");

     sunperf_ffti = use_double ? zffti : cffti;
     sunperf_fftf = use_double ? zfftf : cfftf;
     sunperf_fftb = use_double ? zfftb : cfftb;

     if (N != 0)
          sunperf_ffti(N, work);

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     sunperf_fftf(N, arr, work),
                     1.0, -1, 1,
                     sunperf_fftb(N, arr, work),
                     1.0/N,
                     compute_accuracy);
#endif
}


/* Benchmark entry for the SGI complib.sgimath 1D FFT (compiled only
   with HAVE_LIBCOMPLIB_SGIMATH).  Handles rank-1 problems.  The z*
   routines are used when FFTW_REAL has the size of a double, the c*
   routines otherwise.  For N != 0 the matching *fft1di() routine is run
   once on work. */
void do_sgimath_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBCOMPLIB_SGIMATH
     FFTW_COMPLEX *zfft1di(int n, FFTW_COMPLEX *save);
     int zfft1d(int sign, int n, FFTW_COMPLEX *array,
                int inc, FFTW_COMPLEX *save);
     FFTW_COMPLEX *cfft1di(int n, FFTW_COMPLEX *save);
     int cfft1d(int sign, int n, FFTW_COMPLEX *array,
                int inc, FFTW_COMPLEX *save);

     if (rank != 1)
          return;

     FFT_NAME("SGIMATH");

     if (sizeof(double) != sizeof(FFTW_REAL)) {
          /* single-precision routines */
          if (N != 0)
               cfft1di(N, work);
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          cfft1d(-1, N, arr, 1, work),
                          1.0, -1, 1,
                          cfft1d(1, N, arr, 1, work),
                          1.0/N,
                          compute_accuracy);
     }
     else {
          /* double-precision routines */
          if (N != 0)
               zfft1di(N, work);
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          zfft1d(-1, N, arr, 1, work),
                          1.0, -1, 1,
                          zfft1d(1, N, arr, 1, work),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}

/* Benchmark entry for the SCSL 1D complex FFT (compiled only with
   HAVE_LIBSCS).  Handles rank-1 problems.  zzfft is used when FFTW_REAL
   has the size of a double, ccfft otherwise.  The work array is split
   into a table at its start and a second scratch area 2*N + 30 reals
   further on.  For N != 0 the routine is first called once with
   isign == 0. */
void do_scsl_fft(int rank, int *n, int *n_rev, int N, 
		 short is_power_of_two,
		 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBSCS
     typedef void scsl_fft_proc(int *isign, int *n, FFTW_REAL *scale,
                                FFTW_COMPLEX *x, FFTW_COMPLEX *y,
                                FFTW_REAL *table, FFTW_REAL *work,
                                int *isys);
     extern scsl_fft_proc
          FORTRANIZE(ccfft,CCFFT), FORTRANIZE(zzfft,ZZFFT);
     scsl_fft_proc *scsl_fft;
     FFTW_REAL *table, *work2;
     FFTW_REAL scale = 1.0;
     int isign = -1, isign2 = +1;
     int n1 = N;
     int isys = 0;

     if (rank != 1)
          return;

     FFT_NAME("SCSL");

     table = (FFTW_REAL *) work;
     work2 = table + 2*N + 30;

     scsl_fft = (sizeof(FFTW_REAL) == sizeof(double))
          ? FORTRANIZE(zzfft,ZZFFT)
          : FORTRANIZE(ccfft,CCFFT);

     if (N != 0) {
          n1 = N;
          isign = 0;
          scsl_fft(&isign, &n1, &scale, arr, arr,
                   table, work2, &isys);
          isign = -1;
     }

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     scsl_fft(&isign, &n1, &scale, arr, arr,
                              table, work2, &isys),
                     1.0, -1, 1,
                     scsl_fft(&isign2, &n1, &scale, arr, arr,
                              table, work2, &isys),
                     1.0/N,
                     compute_accuracy);
#endif
}

/* Benchmark entry for the ASCI Red Pentium Pro FFT (compiled only with
   HAVE_ASCI_RED_FFT).  Handles rank-1 problems.  zfft1d_ is used when
   FFTW_REAL has the size of a double, cfft1d_ otherwise.  For N != 0
   the routine is first called once with a flag of 0; the benchmarked
   calls use -1 and +1. */
void do_asci_red_fft(int rank, int *n, int *n_rev, int N,
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_ASCI_RED_FFT
     void zfft1d_(FFTW_COMPLEX*, int*, int*, FFTW_COMPLEX*);
     void cfft1d_(FFTW_COMPLEX*, int*, int*, FFTW_COMPLEX*);
     int setup_flag = 0, is1 = -1, is2 = +1;

     if (rank != 1)
          return;

     FFT_NAME("ASCI Red PPro FFT");

     if (sizeof(double) != sizeof(FFTW_REAL)) {
          /* single precision */
          if (N != 0)
               cfft1d_(arr, &N, &setup_flag, work);
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          cfft1d_(arr, &N, &is1, work),
                          1.0, -1, 1,
                          cfft1d_(arr, &N, &is2, work),
                          1.0/N,
                          compute_accuracy);
     }
     else {
          /* double precision */
          if (N != 0)
               zfft1d_(arr, &N, &setup_flag, work);
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          zfft1d_(arr, &N, &is1, work),
                          1.0, -1, 1,
                          zfft1d_(arr, &N, &is2, work),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}


/* Benchmark entry for the IMSL complex FFT (compiled only with
   HAVE_LIBIMSL).  Handles rank-1 problems.  The routines without the
   "d" prefix are used when FFTW_REAL has the size of a float, the
   d-prefixed ones otherwise.  For N != 0 the init routine is run once
   on work + N, which is also passed to every transform call together
   with work itself. */
void do_imsl_fft(int rank, int *n, int *n_rev, int N, 
		 short is_power_of_two,
		 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBIMSL
     typedef void ffti_proc(int *,FFTW_COMPLEX *);
     typedef void fftfb_proc(int *,FFTW_COMPLEX *,FFTW_COMPLEX *,
                             FFTW_COMPLEX *,FFTW_COMPLEX *);
     extern ffti_proc FORTRANIZE(fftci,FFTCI), FORTRANIZE(dfftci,DFFTCI);
     extern fftfb_proc
          FORTRANIZE(f2tcf,F2TCF), FORTRANIZE(df2tcf,DF2TCF),
          FORTRANIZE(f2tcb,F2TCB), FORTRANIZE(df2tcb,DF2TCB);
     ffti_proc *imsl_ffti;
     fftfb_proc *imsl_fftf, *imsl_fftb;
     const int use_single = (sizeof(float) == sizeof(FFTW_REAL));

     if (rank != 1)
          return;

     FFT_NAME("IMSL");

     imsl_ffti = use_single ? FORTRANIZE(fftci,FFTCI)
                            : FORTRANIZE(dfftci,DFFTCI);
     imsl_fftf = use_single ? FORTRANIZE(f2tcf,F2TCF)
                            : FORTRANIZE(df2tcf,DF2TCF);
     imsl_fftb = use_single ? FORTRANIZE(f2tcb,F2TCB)
                            : FORTRANIZE(df2tcb,DF2TCB);

     if (N != 0)
          imsl_ffti(&N, work + N);

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     imsl_fftf(&N, arr, arr, work + N, work),
                     1.0, -1, 1,
                     imsl_fftb(&N, arr, arr, work + N, work),
                     1.0/N,
                     compute_accuracy);
#endif
}

/* Benchmark entry for the NAG complex FFT (compiled only with
   HAVE_LIBNAG).  Handles rank-1 problems.  c06fcf is used when FFTW_REAL
   has the size of a double, c06fce otherwise.  For N != 0 the routine is
   called once up front; if it reports a non-zero failure code the
   benchmark is skipped with a message chosen from that code. */
void do_nag_fft(int rank, int *n, int *n_rev, int N, 
		 short is_power_of_two,
		 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBNAG
     typedef void fftfb_proc(FFTW_REAL *r, FFTW_REAL *i, int *n,
			     FFTW_REAL *work, int *fail);
     typedef void (*fftfb_proc_ptr)(FFTW_REAL *r, FFTW_REAL *i, int *n,
				    FFTW_REAL *work, int *fail);
     fftfb_proc_ptr nag_fftf;
     int fail = 0; /* MUST be initialized to 0 for NAG */
     FFTW_REAL *re,*im;
     extern fftfb_proc 
	  FORTRANIZE(c06fcf,C06FCF), FORTRANIZE(c06fce,C06FCE);

     if (rank != 1) return;
    
     FFT_NAME("NAG");

     if (sizeof(double) == sizeof(FFTW_REAL)) {
	  nag_fftf = FORTRANIZE(c06fcf,C06FCF);
     }
     else {
	  nag_fftf = FORTRANIZE(c06fce,C06FCE);
     }

     /* arr is handed over as two runs of N reals. */
     re = (FFTW_REAL *)arr;
     im = re + N;

     /* Note: NAG has no initialization or inverse routines. */
     
     if (N != 0) {
	  /* Call once to check for failures: */
	  nag_fftf(re,im,&N,(FFTW_REAL*)work,&fail);
     }
     if (fail == 0) {
	  /* No inverse call is supplied (0) and compute_accuracy is
	     negated. */
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  nag_fftf(re,im,&N,(FFTW_REAL*)work,&fail),
			  sqrt(N), -1, 0,
			  0,
			  1.0/N,
			  -compute_accuracy);     	  
     }
     else {
	  /* Each skip_benchmark() call is terminated with a semicolon, as
	     everywhere else in this file. */
	  if (fail == 1)
	       skip_benchmark("can't handle factors > 19");
	  else if (fail == 2)
	       skip_benchmark("can't handle more than 20 prime factors");
	  else if (fail == 3) 
	       skip_benchmark("can't handle N <= 1");
	  else
	       skip_benchmark("Unknown error in NAG");
     }
#endif
}

#ifdef HAVE_LIBDXML
#include <dxmldef.h>
#endif

/* Benchmark entry for the DXML FFT (compiled only with HAVE_LIBDXML).
   Handles rank-1 problems.  The cfft_* routines are used when FFTW_REAL
   has the size of a float, the zfft_* routines when it has the size of
   a double; nothing is done otherwise.  For N != 0 the FFT structure is
   set up before the benchmark and released after it. */
void do_dxml_fft(int rank, int *n, int *n_rev, int N, 
		 short is_power_of_two,
		 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBDXML
     if (rank != 1)
          return;

     FFT_NAME("DXML");

     if (sizeof(float) == sizeof(FFTW_REAL)) {
          DXML_C_FFT_STRUCTURE fft_struct;
          int unit_stride = 1;
          int stride = 1;

          if (N != 0)
               cfft_init_(&N, &fft_struct, &unit_stride);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     cfft_apply_("C","C","F",arr,arr,&fft_struct,&stride),
                     1.0, -1, 1,
                     cfft_apply_("C","C","B",arr,arr,&fft_struct,&stride),
                     1.0/N,
                     compute_accuracy);

          if (N != 0)
               cfft_exit_(&fft_struct);
     }
     else if (sizeof(double) == sizeof(FFTW_REAL)) {
          DXML_Z_FFT_STRUCTURE fft_struct;
          int unit_stride = 1;
          int stride = 1;

          if (N != 0)
               zfft_init_(&N, &fft_struct, &unit_stride);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     zfft_apply_("C","C","F",arr,arr,&fft_struct,&stride),
                     1.0, -1, 1,
                     zfft_apply_("C","C","B",arr,arr,&fft_struct,&stride),
                     1.0/N,
                     compute_accuracy);

          if (N != 0)
               zfft_exit_(&fft_struct);
     }
#endif
}

/**************************** 3D FFTs: *******************************/

extern int maxn(int n, int nums[]);
extern int max2n(int n, int nums[]);

/* Benchmark entry for FFTW's multi-dimensional transform.
   When fft_data_cur is set and N != 0, a forward plan (FFTW_MEASURE,
   plus wisdom when available) and a backward plan (FFTW_ESTIMATE) are
   created for in-place use, the benchmark is run, and both plans are
   destroyed again.
   Returns 1 if either plan could not be created, 0 otherwise. */
short do_fftwnd_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
     fftwnd_plan p = 0, p_inv = 0;
     int wis_flags = 0;

#ifdef FFTW_HAS_WISDOM /* if we are using a version of FFTW with the
			  "wisdom" feature, then take advantage of it */
     wis_flags = FFTW_USE_WISDOM;
#endif
     
     FFT_NAME("FFTW");

     if (fft_data_cur && N != 0) {
#ifdef FFTW_HAS_PLAN_SPECIFIC
	  p = fftwnd_create_plan_specific(rank,n,FFTW_FORWARD,
					  FFTW_MEASURE|FFTW_IN_PLACE|wis_flags,
					  arr,1,arr,1);
	  p_inv = fftwnd_create_plan_specific(rank,n,FFTW_BACKWARD,
					      FFTW_ESTIMATE|FFTW_IN_PLACE,
					      arr,1,arr,1);
#else
	  p = fftwnd_create_plan(rank,n,FFTW_FORWARD,
				 FFTW_MEASURE|FFTW_IN_PLACE|wis_flags);
	  p_inv = fftwnd_create_plan(rank,n,FFTW_BACKWARD,
				     FFTW_ESTIMATE|FFTW_IN_PLACE);
#endif
	  if (!p || !p_inv) {
	       log_printf("\n\nError creating FFTWND plan!\n");
	       printf("\n\nError creating FFTWND plan!\n");
	       /* Only one of the two plans may have failed: release the
		  one that was created so it is not leaked. */
	       if (p)
		    fftwnd_destroy_plan(p);
	       if (p_inv)
		    fftwnd_destroy_plan(p_inv);
	       return 1;
	  }
#ifdef FFTWND_HAS_PRINT_PLAN
	  if (bench_log_file) {
	       log_printf("\nFFTWND ");
	       fftwnd_fprint_plan(bench_log_file,p);
	       log_printf("\n");
	  }
#endif
     }
     
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
		     fftwnd(p,1,arr,1,0,arr,1,0),
		     1.0, FFTW_FORWARD, 1,
		     fftwnd(p_inv,1,arr,1,0,arr,1,0), 1.0/N,
		     compute_accuracy);
     
     if (fft_data_cur && N != 0) {
	  fftwnd_destroy_plan(p);
	  fftwnd_destroy_plan(p_inv);
     }
     return 0;
}

/* Benchmark entry for the f2c translation of Temperton's 3D FFT (gpf3d).
   Handles rank-3 problems.  The size must pass check_prime_factors(N, 5)
   and, for N != 0, a trial call of gpf3d() must return 0; otherwise the
   benchmark is reported as skipped. */
void do_temperton_3d_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
     int usable;

     if (rank != 3)
          return;

     FFT_NAME("Temperton (f2c)");

     usable = check_prime_factors(N, 5) &&
          (N == 0 || gpf3d(arr, n_rev[0], n_rev, +1) == 0);

     if (usable) {
          /* N.B. The +1/-1 parameter in GPF3D means the opposite
             of the usual (it is the negative of the sign of the
             exponent in the transform). */
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          gpf3d(arr, n_rev[0], n_rev, +1),
                          1.0, -1, 1,
                          gpf3d(arr, n_rev[0], n_rev, -1),
                          1.0/N,
                          compute_accuracy);
          return;
     }

     /* Pick the skip message: factors are fine but a dimension is too
        large, or the factorization itself is unsupported. */
     if (check_prime_factors(N, 5) && maxn(3, n) > 256)
          skip_benchmark("can't handle dimensions > 256");
     else
          skip_benchmark("only handles N = 2^m 3^n 5^q");
}

/* Benchmark entry for Temperton's Fortran 3D FFT (compiled only with
   HAVE_F77, active only when FFTW_REAL has the size of a double).
   Handles rank-3 problems.  For N != 0 the size must pass
   check_prime_factors(N, 5) and have no dimension above 256; in that
   case the routine is called once before the benchmark, otherwise the
   benchmark is reported as skipped. */
void do_temperton_f_3d_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int unsupported = 0;
     int is1 = +1, is2 = -1;

     if (sizeof(FFTW_REAL) != sizeof(double))
          return;
     if (rank != 3)
          return;

     FFT_NAME("Temperton");

     if (N != 0) {
          /* insure that array size is power of 2, 3, & 5 only: */
          if (check_prime_factors(N, 5) && maxn(3, n) <= 256)
               FORTRANIZE(gpfafgpf3d,GPFAFGPF3D)
                    (arr, &n_rev[0], n_rev, &is1);
          else
               unsupported = 1;
     }

     if (!unsupported) {
          /* N.B. The +1/-1 parameter in GPF3D means the opposite
             of the usual (it is the negative of the sign of the
             exponent in the transform). */
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(gpfafgpf3d,GPFAFGPF3D)
                               (arr, &n_rev[0], n_rev, &is1),
                          1.0, -1, 1,
                          FORTRANIZE(gpfafgpf3d,GPFAFGPF3D)
                               (arr, &n_rev[0], n_rev, &is2),
                          1.0/N,
                          compute_accuracy);
          return;
     }

     if (maxn(3, n) > 256)
          skip_benchmark("can't handle dimensions > 256");
     else
          skip_benchmark("only handles N = 2^m 3^n 5^q");
#endif
}

/* Benchmark entry for the MFFT Fortran 3D FFT (compiled only with both
   HAVE_MFFT and HAVE_F77, and active only when FFTW_REAL, float and int
   all have the same size).  Handles rank-3 problems.  For N != 0 the
   size must pass check_prime_factors(N, 5); four integer work arrays
   are then allocated, the routine is called once with a sign flag of 0
   to initialize them, and its error code decides whether the benchmark
   runs.  The work arrays are released before returning. */
void do_mfft_3d_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_MFFT
#ifdef HAVE_F77
     /* Auggh!  MFFT, because of a totally non-portable hack that
	it uses, requires that the integer and floating point types
	be the same size!  Damn these Fortran programmers! */
     if (sizeof(FFTW_REAL) == sizeof(float) &&
	 sizeof(FFTW_REAL) == sizeof(int) /* <-- obscenity */) {

	  int np = 0, is0=0,is1=-1,is2=+1;
	  int *wl=0,*wm=0,*wn=0,*iwork=0;
	  int iopt=0,iord=1,ierr;
	  
	  if (rank != 3) return;

	  FFT_NAME("MFFT");
	  if (!FFT_OK) return;
	  if (N != 0) {
	       /* insure that array size is power of 2, 3, & 5 only: */
	       if (check_prime_factors(N,5)) {
		    wl = (int*) fftw_malloc(sizeof(int)*(4*n_rev[0]+14));
		    wm = (int*) fftw_malloc(sizeof(int)*(4*n_rev[1]+14));
		    wn = (int*) fftw_malloc(sizeof(int)*(4*n_rev[2]+14));
		    iwork = (int*) fftw_malloc(sizeof(int)*maxn(3,n_rev));
		    if (wl == 0 || wm == 0 || wn == 0 || iwork == 0) {
			 printf("\nERROR!  Out of memory for MFFT!\n");
			 exit(1);
		    }

		    /* Initialize work arrays: */
		    FORTRANIZE(mfftc3fft,MFFTC3FFT)
			 (arr,&n_rev[0],&n_rev[0],&n_rev[1],&n_rev[2],
			  wl,wm,wn,&iopt,&is0,&iord,iwork,&ierr);

		    /* A non-zero error code from the set-up call disables
		       the benchmark below. */
		    np = ierr;
	       }
	       else
		    np = 1;
	  }
	  if (np == 0)
	       DO_BENCHMARK_ND(rank, n, N, arr, arr,
			       FORTRANIZE(mfftc3fft,MFFTC3FFT)
			       (arr,&n_rev[0],&n_rev[0],&n_rev[1],&n_rev[2],
				wl,wm,wn,&iopt,&is1,&iord,iwork,&ierr),
			       1.0, -1, 1,
			       FORTRANIZE(mfftc3fft,MFFTC3FFT)
			       (arr,&n_rev[0],&n_rev[0],&n_rev[1],&n_rev[2],
				wl,wm,wn,&iopt,&is2,&iord,iwork,&ierr),
			       1.0/N,
			       compute_accuracy);
	  else 
	       skip_benchmark("only handles N = 2^m 3^n 5^q");

	  /* Release the work arrays.  They are only ever allocated when
	     N != 0, so the clean-up must not be restricted to N == 0;
	     the pointers stay null when nothing was allocated, which
	     makes each guarded free a no-op in that case. */
	  if (wl) fftw_free(wl);
	  if (wm) fftw_free(wm);
	  if (wn) fftw_free(wn);
	  if (iwork) fftw_free(iwork);
     }
#endif
#endif
}

/* Benchmark entry for the HARM Fortran 3D FFT (compiled only with
   HAVE_F77, active only when FFTW_REAL has the size of a double).
   Handles rank-3, power-of-two problems whose dimensions all exceed 4;
   other sizes are reported as skipped.  For N != 0 the number of
   factors of two in each n_rev[] entry is computed, an integer table is
   allocated, and the routine is called once with ifset == 0 before the
   benchmark. */
void do_harm_fft(int rank, int *n, int *n_rev, int N, 
		 short is_power_of_two,
		 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		 int size_arr, int size_work,
		 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int m[3];
     int ifset, ifset2 = -2, iferr;
     int *inv = 0;

     if (sizeof(FFTW_REAL) != sizeof(double))
          return;
     if (rank != 3)
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("HARM");
     if (!FFT_OK)
          return;

     if (N != 0 && !(n[0] > 4 && n[1] > 4 && n[2] > 4)) {
          skip_benchmark("all dimensions must be > 4 for HARM");
          return;
     }

     if (N != 0) {
          int i;

          for (i = 0; i < 3; ++i) {
               int dim = n_rev[i];

               m[i] = 0;
               while (dim % 2 == 0) {
                    dim /= 2;
                    ++m[i];
               }
          }

          inv = (int *) fftw_malloc(sizeof(int) * maxn(3, n));
          if (!inv) {
               printf("\n\nERROR!  Out of memory!\n");
               if (bench_log_file)
                    fprintf(bench_log_file,
                            "\n\nERROR!  Out of memory!\n");
               exit(1);
          }

          /* set-up call (ifset == 0); the benchmarked calls use +2 / -2 */
          ifset = 0;
          FORTRANIZE(harm,HARM)(arr, m, inv, work, &ifset, &iferr);
          ifset = +2;
     }

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     FORTRANIZE(harm,HARM)
                          (arr, m, inv, work, &ifset, &iferr),
                     1.0, +1, 1,
                     FORTRANIZE(harm,HARM)
                          (arr, m, inv, work, &ifset2, &iferr),
                     1.0,
                     compute_accuracy);

     if (N != 0)
          fftw_free(inv);
#endif
}

/* Benchmark entry for the f2c translation of HARM (harmd).  Handles
   rank-3, power-of-two problems whose dimensions all exceed 4; other
   sizes are reported as skipped.  For N != 0 the number of factors of
   two in each n_rev[] entry is computed, an integer table is allocated,
   and harmd() is called once with ifset == 0 before the benchmark. */
void do_harm_f2c_fft(int rank, int *n, int *n_rev, int N, 
		     short is_power_of_two,
		     FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		     int size_arr, int size_work,
		     short compute_accuracy, factor_type allowed_factors)
{
     int m[3];
     int ifset, ifset2 = -2, iferr;
     int *inv = 0;

     if (rank != 3)
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("HARM (f2c)");
     if (!FFT_OK)
          return;

     if (N != 0 && !(n[0] > 4 && n[1] > 4 && n[2] > 4)) {
          skip_benchmark("all dimensions must be > 4 for HARM");
          return;
     }

     if (N != 0) {
          int i;

          for (i = 0; i < 3; ++i) {
               int dim = n_rev[i];

               m[i] = 0;
               while (dim % 2 == 0) {
                    dim /= 2;
                    ++m[i];
               }
          }

          inv = (int *) fftw_malloc(sizeof(int) * maxn(3, n));
          if (!inv) {
               printf("\n\nERROR!  Out of memory!\n");
               if (bench_log_file)
                    fprintf(bench_log_file,
                            "\n\nERROR!  Out of memory!\n");
               exit(1);
          }

          /* set-up call (ifset == 0); the benchmarked calls use +2 / -2 */
          ifset = 0;
          harmd(arr, m, inv, work, &ifset, &iferr);
          ifset = +2;
     }

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     harmd(arr, m, inv, work, &ifset, &iferr),
                     1.0, +1, 1,
                     harmd(arr, m, inv, work, &ifset2, &iferr),
                     1.0,
                     compute_accuracy);

     if (N != 0)
          fftw_free(inv);
}

/* Benchmark entry for the Numerical Recipes C routine fourn (compiled
   only with HAVE_NRC).  Handles power-of-two problems.  The data
   pointer and the dimension array are both shifted so that the routine
   can index them starting from 1.
   NOTE(review): the rank check below accepts only rank 2, although this
   routine sits in the 3D section -- confirm the intended rank. */
void do_nrc_fourn_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NRC
     if (rank != 2) return;
     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (C)");
     if (!FFT_OK) return;
     {
	  FFTW_REAL *arr2 = (FFTW_REAL*)arr - 1; /* 1-based! */
	  unsigned int n1[4] = {0, 0, 0, 0};
	  int i;

	  /* Copy only the dimensions that exist (at most three), so that
	     n[] is never read at or beyond index `rank`. */
	  for (i = 0; i < rank && i < 3; ++i)
	       n1[i + 1] = n[i]; /* 1-based! */

	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  nrc_fourn(arr2, n1, rank, -1),
			  1.0, -1, 1,
			  nrc_fourn(arr2, n1, rank, +1),
			  1.0/N,
			  compute_accuracy);
     }
#endif
}

/* Benchmark entry for the Numerical Recipes Fortran routine fourn
   (compiled only with both HAVE_NRF and HAVE_F77, active only when
   FFTW_REAL has the size of a double).  Handles power-of-two problems;
   the rank and the reversed dimension array are passed straight to the
   routine. */
void do_nrf_fourn_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_NRF
#ifdef HAVE_F77
     int is1 = -1, is2 = +1;

     if (sizeof(FFTW_REAL) != sizeof(double))
          return;

     FFT_REQUIRE_POWER_OF_TWO;
     FFT_NAME("NR (F)");
     if (!FFT_OK)
          return;

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     FORTRANIZE(nrffourn,NRF_FOURN)
                          (arr, n_rev, &rank, &is1),
                     1.0, -1, 1,
                     FORTRANIZE(nrffourn,NRF_FOURN)
                          (arr, n_rev, &rank, &is2),
                     1.0/N,
                     compute_accuracy);
#endif
#endif
}

/* Benchmark entry for the PDA Fortran N-dimensional FFT (compiled only
   with HAVE_F77).  The pdadnfft* routines are used when FFTW_REAL has
   the size of a double, the pdanfft* routines when it has the size of a
   float; nothing is benchmarked otherwise.  arr is handed over as two
   runs of N reals, and the first scale factor is sqrt(N). */
void do_pda_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     int result;

     FFT_NAME("PDA");

     if (sizeof(FFTW_REAL) == sizeof(double)) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(pdadnfftf,PDADNFFTF)(&rank, n_rev,
                                                          (FFTW_REAL*)arr,
                                                          (FFTW_REAL*)arr + N,
                                                          work, &result),
                          sqrt(1.0*N), -1, 0,
                          FORTRANIZE(pdadnfftb,PDADNFFTB)(&rank, n_rev,
                                                          (FFTW_REAL*)arr,
                                                          (FFTW_REAL*)arr + N,
                                                          work, &result),
                          1.0,
                          compute_accuracy);
          return;
     }

     if (sizeof(FFTW_REAL) == sizeof(float)) {
          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          FORTRANIZE(pdanfftf,PDANFFTF)(&rank, n_rev,
                                                        (FFTW_REAL*)arr,
                                                        (FFTW_REAL*)arr + N,
                                                        work, &result),
                          sqrt(1.0*N), -1, 0,
                          FORTRANIZE(pdanfftb,PDANFFTB)(&rank, n_rev,
                                                        (FFTW_REAL*)arr,
                                                        (FFTW_REAL*)arr + N,
                                                        work, &result),
                          1.0,
                          compute_accuracy);
     }
#endif
}

/* Benchmark entry for the f2c translation of the PDA N-dimensional FFT.
   No rank or size restriction is applied here.  arr is handed over as
   two runs of N reals, and the first scale factor is sqrt(N). */
void do_pda_f2c_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
     int result;   /* status slot written by the routines; not examined */

     FFT_NAME("PDA (f2c)");

     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     pda_f2c_dnfftf(&rank, n_rev,
                                    (FFTW_REAL*)arr,
                                    (FFTW_REAL*)arr + N,
                                    work, &result),
                     sqrt(1.0*N), -1, 0,
                     pda_f2c_dnfftb(&rank, n_rev,
                                    (FFTW_REAL*)arr,
                                    (FFTW_REAL*)arr + N,
                                    work, &result),
                     1.0,
                     compute_accuracy);
}

/* Benchmark entry for Singleton's Fortran FFT applied to 3D data
   (compiled only with HAVE_F77, active only when FFTW_REAL has the size
   of a double).  Handles rank-3 problems for which
   check_prime_factors(N, 23) succeeds; other sizes are reported as
   skipped.  Each direction of the transform is made of three calls to
   the routine, one per dimension. */
void do_singleton_3d_fft(int rank, int *n, int *n_rev, int N, 
			 short is_power_of_two,
			 FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
			 int size_arr, int size_work,
			 short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_F77
     /* N1 is a copy of N whose address is passed to the Fortran routine;
	is1 / is2 are the last argument of the first / second group of
	calls. */
     int N1 = N, n1, n2, n3, prod2, is1 = -2, is2 = +2;
     
     if (rank != 3) return;
     if (sizeof(FFTW_REAL) != sizeof(double)) return;

     /* Dimensions are taken from n[] in reverse order. */
     n1 = n[2];
     n2 = n[1];
     n3 = n[0];
     
     prod2 = n1 * n2;
     
     FFT_NAME("Singleton");
     if (check_prime_factors(N,23)) {
	  /* Every call gets pointers to the first and second real of arr
	     and the total size N1.  The next two arguments are (n1, n1),
	     then (n2, n1*n2), then (n3, N1) for the three calls. */
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  {
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n1, &n1, &is1);
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n2, &prod2, &is1);
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n3, &N1, &is1);
			  },
			  1.0, -1, 1,
			  {
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n1, &n1, &is2);
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n2, &prod2, &is2);
			       FORTRANIZE(singletonfft,SINGLETONFFT)
				    ((FFTW_REAL*)arr, (FFTW_REAL*)arr+1, 
				     &N1, &n3, &N1, &is2);
			  },
			  1.0/N,
			  compute_accuracy);     
     }
     else
	  skip_benchmark("can't handle prime factors > 23");
#endif
}


/*
 * Benchmark the f2c-translated Singleton FFT (go_fft) on an in-place 3d
 * transform.
 *
 * Returns immediately unless rank == 3.  Unlike the Fortran variant,
 * go_fft takes its integer arguments by value, and there is no
 * precision check here.
 *
 *   n                 dimensions; n[2], n[1] and n[0] are read
 *   N                 total size passed to go_fft (presumably the product
 *                     of the three dimensions -- confirm with caller)
 *   arr               data, passed as two FFTW_REAL pointers one element
 *                     apart (arr and arr+1 viewed as reals)
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 *
 * Sizes for which check_prime_factors(N,23) is false are reported via
 * skip_benchmark() rather than run.
 */
void do_singleton_3d_f2c_fft(int rank, int *n, int *n_rev, int N,
                             short is_power_of_two,
                             FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                             int size_arr, int size_work,
                             short compute_accuracy,
                             factor_type allowed_factors)
{
     int len_inner, len_middle, len_outer, span_middle;

     if (rank != 3)
          return;

     FFT_NAME("Singleton (f2c)");

     len_inner = n[2];
     len_middle = n[1];
     len_outer = n[0];

     /* Cumulative product of the two innermost lengths. */
     span_middle = len_inner * len_middle;

     if (!check_prime_factors(N,23)) {
          skip_benchmark("can't handle prime factors > 23");
          return;
     }

     /*
      * Three go_fft calls per transform, one per dimension: each gets
      * the total size, that dimension's length and the cumulative
      * product of lengths up to it.  The last argument is -2 in the
      * first group and +2 in the second.
      */
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
                     {
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_inner, len_inner, -2);
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_middle, span_middle, -2);
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_outer, N, -2);
                     },
                     1.0, -1, 1,
                     {
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_inner, len_inner, +2);
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_middle, span_middle, +2);
                          go_fft((FFTW_REAL*)arr, (FFTW_REAL*)arr+1,
                                 N, len_outer, N, +2);
                     },
                     1.0/N,
                     compute_accuracy);
}

/*
 * Benchmark the ESSL scft3 routine on an in-place 3d transform.
 *
 * The body is only compiled when HAVE_LIBESSL is defined.  It returns
 * immediately unless rank == 3, and does nothing unless FFTW_REAL has
 * the size of a float (only the single-precision routine is wired up).
 *
 *   n                 dimensions; n[0..2] are validated and read
 *   N                 total size; N == 0 bypasses the size validation
 *   arr               data, used as both input and output of scft3
 *   work, size_work   auxiliary storage and its length (passed as naux)
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 *
 * A size is accepted only if every dimension is even, at least 6, and
 * has an odd part no larger than 9; anything else is reported through
 * skip_benchmark().
 */
void do_essl_3d_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBESSL
     /* scft3 is strictly three-dimensional and n[0..2] are read below,
	so refuse any other rank (as the other 3d-only routines do). */
     if (rank != 3) return;

     /* I only have the single-precision version of this routine: */
     if (sizeof(FFTW_REAL) == sizeof(float)) {
	  int ok = 1;
	  FFT_NAME("ESSL");
	  if (N != 0) {
	       int i;
	       for (i = 0; i < 3; ++i) {
		    /* Strip all factors of two; what remains is the
		       odd part of this dimension. */
		    int n1 = n[i];
		    while (n1 % 2 == 0) n1 /= 2;
		    if (n1 > 9)
			 ok = 0;
		    if (n[i] % 2 != 0 || n[i] < 6)
			 ok = 0;
	       }
	  }
	  if (ok) {
	       int inc2x, inc3x, n1, n2, n3, isign, is2 = +1, naux;
	       FFTW_REAL scale = 1.0;
	       
	       naux = size_work;
	       n1 = n[2];
	       n2 = n[1];
	       n3 = n[0];
	       /* Strides of the 2nd and 3rd dimensions for a contiguous
		  array whose fastest dimension has length n1. */
	       inc2x = n1;
	       inc3x = n1 * n2;
	       isign = -1;
	       
	       /* NOTE(review): the sign handed to DO_BENCHMARK_ND is +1
		  while isign is -1; presumably ESSL's sign convention
		  is the opposite of the harness's -- confirm. */
	       DO_BENCHMARK_ND(rank, n, N, arr, arr,
			       scft3(arr, &inc2x, &inc3x, arr, 
				     &inc2x, &inc3x, 
				     &n1, &n2, &n3, &isign, 
				     &scale, work, &naux),
			       1.0, +1, 1,
			       scft3(arr, &inc2x, &inc3x, arr, 
				     &inc2x, &inc3x, 
				     &n1, &n2, &n3, &is2, 
				     &scale, work, &naux),
			       1.0/N,
			       compute_accuracy);
	  }
	  else
	       skip_benchmark("only works for N = x * 2^m, "
			      "where x < 10 and m >= 1");
     }
#endif
}

/*
 * Benchmark the Cray SCILIB CCFFT3D routine on an in-place 3d transform.
 *
 * The body is only compiled when HAVE_LIBSCI is defined and HAVE_LIBSCS
 * is not.  It returns immediately unless rank == 3, and does nothing
 * unless FFTW_REAL has the size of a double.
 *
 *   n, n_rev          dimensions; n_rev[0..2] are passed to CCFFT3D as
 *                     n1, n2, n3 (n[0..2] size the table on non-MPP)
 *   N                 total size; N == 0 skips the initialization call
 *   arr               data, used as both input and output
 *   work              storage split into the routine's table and work
 *                     areas (layout differs between _CRAYMPP and others)
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 */
void do_scilib_3d_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#if defined(HAVE_LIBSCI) && !defined(HAVE_LIBSCS)
     /* CCFFT3D is strictly three-dimensional and n[0..2] / n_rev[0..2]
	are read unconditionally below, so refuse any other rank (as the
	other 3d-only routines do). */
     if (rank != 3) return;

     if (sizeof(FFTW_REAL) == sizeof(double)) {
	  int isign, isign2 = +1, n1, n2, n3, ldx, ldy, ldx2, ldy2;
	  FFTW_REAL scale = 1.0;
	  FFTW_REAL *cray_fft_work, *cray_fft_table;
	  int cray_fft_isys[4];
	  
	  FFT_NAME("SCILIB");

	  /* Leading dimensions equal the transform lengths. */
	  ldx = ldy = n1 = n_rev[0];
	  ldx2 = ldy2 = n2 = n_rev[1];
	  n3 = n_rev[2];
	  
#ifdef _CRAYMPP
	  /* Note: when N == 0 the table/work pointers stay unset here;
	     the initialization call below is skipped in that case too. */
	  if (N != 0) {
	       int i;
	       for (i = 0; i < 3; ++i) {
		    /* Flag each dimension that has a prime factor the
		       check (limit 5) rejects. */
		    cray_fft_isys[1+i] = !check_prime_factors(n_rev[i],5);
	       }
	       cray_fft_isys[0] = 3;
	       /* Work area first (2*N reals), table after it. */
	       cray_fft_work = (FFTW_REAL*)work;
	       cray_fft_table = cray_fft_work + 2*(N);
	  }
#else
	  cray_fft_isys[0] = 1;
	  /* Table first (100 + 2*(n[0]+n[1]+n[2]) reals), work area
	     after it. */
	  cray_fft_table = (FFTW_REAL*)work;
	  cray_fft_work = cray_fft_table + 100 + 2*(n[0]+n[1]+n[2]);
#endif
	  
	  if (N != 0) {
	       isign = 0;  /* tells FFT we are initializing */
	       CCFFT3D(&isign, &n1, &n2, &n3, &scale,
		       arr, &ldx, &ldx2, arr, &ldy, &ldy2,
		       cray_fft_table, cray_fft_work, cray_fft_isys);
	  }
	  isign = -1;
	  scale = 1.0;
	  
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  CCFFT3D(&isign, &n1, &n2, &n3, &scale,
				  arr, &ldx, &ldx2, arr, &ldy, &ldy2,
				  cray_fft_table, cray_fft_work, 
				  cray_fft_isys),
			  1.0, -1, 1,
			  CCFFT3D(&isign2, &n1, &n2, &n3, &scale,
				  arr, &ldx, &ldx2, arr, &ldy, &ldy2,
				  cray_fft_table, cray_fft_work, 
				  cray_fft_isys),
			  1.0/N,
			  compute_accuracy);
     }
#endif
}

/*
 * Benchmark the SCSL ccfft3d / zzfft3d routines on an in-place 3d
 * transform.
 *
 * The body is only compiled when HAVE_LIBSCS is defined.  It returns
 * immediately unless rank == 3.  zzfft3d is used when FFTW_REAL has the
 * size of a double, ccfft3d otherwise.
 *
 *   n_rev             dimensions; n_rev[0..2] are passed as n1, n2, n3
 *   N                 total size; N == 0 skips the initialization call
 *   arr               data, used as both input and output
 *   work              storage split into the routine's table (first)
 *                     and work area (after the table)
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 */
void do_scsl_3d_fft(int rank, int *n, int *n_rev, int N, 
		      short is_power_of_two,
		      FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		      int size_arr, int size_work,
		      short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBSCS
     typedef void scsl_fft_proc(int *,int*,int*,int*,FFTW_REAL*,
				FFTW_COMPLEX*,int*,int*,
				FFTW_COMPLEX*,int*,int*,
				FFTW_REAL*,FFTW_REAL*,int*);
     typedef void (*scsl_fft_proc_ptr)(int *,int*,int*,int*,FFTW_REAL*,
				       FFTW_COMPLEX*,int*,int*,
				       FFTW_COMPLEX*,int*,int*,
				       FFTW_REAL*,FFTW_REAL*,int*);
     scsl_fft_proc_ptr scsl_fft;
     extern scsl_fft_proc
	  FORTRANIZE(ccfft3d,CCFFT3D), FORTRANIZE(zzfft3d,ZZFFT3D);
     int isign, isign2 = +1, n1, n2, n3, ldx, ldy, ldx2, ldy2;
     FFTW_REAL scale = 1.0;
     FFTW_REAL *cray_fft_work, *cray_fft_table;
     int cray_fft_isys[1];

     /* The routine is strictly three-dimensional and n_rev[0..2] are
	read unconditionally below, so refuse any other rank (as the
	other 3d-only routines do). */
     if (rank != 3) return;
	  
     FFT_NAME("SCSL");

     /* Pick the entry point matching the compiled precision. */
     if (sizeof(FFTW_REAL) == sizeof(double))
	  scsl_fft = FORTRANIZE(zzfft3d,ZZFFT3D);
     else
	  scsl_fft = FORTRANIZE(ccfft3d,CCFFT3D);

     /* Leading dimensions equal the transform lengths. */
     ldx = ldy = n1 = n_rev[0];
     ldx2 = ldy2 = n2 = n_rev[1];
     n3 = n_rev[2];
     
     cray_fft_isys[0] = 0;
     /* Table occupies (30 + 2*n) reals per dimension at the start of
	work; the routine's work area follows it. */
     cray_fft_table = (FFTW_REAL*)work;
     cray_fft_work = cray_fft_table + (30 + 2 * n1) + (30 + 2 * n2) +
	                              (30 + 2 * n3);
     
     if (N != 0) {
	  isign = 0;  /* tells FFT we are initializing */	       
	  scsl_fft(&isign, &n1, &n2, &n3, &scale,
		   arr, &ldx, &ldx2, arr, &ldy, &ldy2,
		   cray_fft_table, cray_fft_work, cray_fft_isys);
     }
     isign = -1;
     
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
		     scsl_fft(&isign, &n1, &n2, &n3, &scale,
			      arr, &ldx, &ldx2, arr, &ldy, &ldy2,
			      cray_fft_table, cray_fft_work, 
			      cray_fft_isys),
		     1.0, -1, 1,
		     scsl_fft(&isign2, &n1, &n2, &n3, &scale,
			      arr, &ldx, &ldx2, arr, &ldy, &ldy2,
			      cray_fft_table, cray_fft_work, 
			      cray_fft_isys),
		     1.0/N,
		     compute_accuracy);
#endif
}

/*
 * Benchmark the IMSL 3d complex FFT routines (f2t3f/f2t3b, or their
 * double-precision df2t3f/df2t3b counterparts) on an in-place transform.
 *
 * The body is only compiled when HAVE_LIBIMSL is defined.  It returns
 * immediately unless rank == 3.  The single-precision entry points are
 * used when FFTW_REAL has the size of a float, the double-precision
 * ones otherwise.
 *
 *   n, n_rev          dimensions; n_rev[0..2] are passed to IMSL, n is
 *                     only used via maxn(rank, n)
 *   N                 total size; N == 0 skips the initialization calls
 *   arr               data, used as both input and output
 *   work              storage carved into a scratch area followed by
 *                     three per-dimension tables w1, w2, w3
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 */
void do_imsl_3d_fft(int rank, int *n, int *n_rev, int N, 
		    short is_power_of_two,
		    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		    int size_arr, int size_work,
		    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBIMSL
     typedef void ffti_proc(int *,FFTW_COMPLEX *);
     typedef void fftfb_proc(int *,int *, int *,
			     FFTW_COMPLEX *,
			     int *, int *,
			     FFTW_COMPLEX *,
			     int *, int *,
			     FFTW_COMPLEX *,
			     FFTW_COMPLEX *,
			     FFTW_COMPLEX *,
			     FFTW_COMPLEX *);
     typedef void (*ffti_proc_ptr)(int *,FFTW_COMPLEX *);
     typedef void (*fftfb_proc_ptr)(int *,int *, int *,
				    FFTW_COMPLEX *,
				    int *, int *,
				    FFTW_COMPLEX *,
				    int *, int *,
				    FFTW_COMPLEX *,
				    FFTW_COMPLEX *,
				    FFTW_COMPLEX *,
				    FFTW_COMPLEX *);
     ffti_proc_ptr imsl_ffti;
     fftfb_proc_ptr imsl_fftf, imsl_fftb;
     extern ffti_proc FORTRANIZE(fftci,FFTCI), FORTRANIZE(dfftci,DFFTCI);
     extern fftfb_proc 
	  FORTRANIZE(f2t3f,F2T3F), FORTRANIZE(df2t3f,DF2T3F),
	  FORTRANIZE(f2t3b,F2T3B), FORTRANIZE(df2t3b,DF2T3B);
     extern int maxn(int n, int nums[]);
     FFTW_COMPLEX *w1,*w2,*w3;

     if (rank != 3) return;
    
     FFT_NAME("IMSL");

     /*
      * Layout of work (in FFTW_COMPLEX units):
      *   [0, maxn(rank,n))            scratch passed as the last argument
      *   w1: 2*n_rev[0] + 8 elements  table for the first dimension
      *   w2: 2*n_rev[1] + 8 elements  table for the second dimension
      *   w3: the remainder            table for the third dimension
      * NOTE(review): assumes the caller sized work to leave at least
      * 2*n_rev[2] + 8 elements after w3 -- confirm.
      */
     w1 = work + maxn(rank,n);
     w2 = w1 + 2*n_rev[0] + 8;
     /* This used to assign w2 a second time, which left w3
	uninitialized and moved w2 past the first dimension's table. */
     w3 = w2 + 2*n_rev[1] + 8;

     if (sizeof(float) == sizeof(FFTW_REAL))  {
	  imsl_ffti = FORTRANIZE(fftci,FFTCI);
	  imsl_fftf = FORTRANIZE(f2t3f,F2T3F);
	  imsl_fftb = FORTRANIZE(f2t3b,F2T3B);
     }
     else {
	  imsl_ffti = FORTRANIZE(dfftci,DFFTCI);
	  imsl_fftf = FORTRANIZE(df2t3f,DF2T3F);
	  imsl_fftb = FORTRANIZE(df2t3b,DF2T3B);
     }
     
     /* Initialize one table per dimension before timing. */
     if (N != 0) {
	  imsl_ffti(&n_rev[0], w1);
	  imsl_ffti(&n_rev[1], w2);
	  imsl_ffti(&n_rev[2], w3);
     }
     DO_BENCHMARK_ND(rank, n, N, arr, arr,
		     imsl_fftf(&n_rev[0],&n_rev[1],&n_rev[2],
			       arr, &n_rev[0],&n_rev[1],
			       arr, &n_rev[0],&n_rev[1],
			       w1,w2,w3,work),
		     1.0, -1, 1,
		     imsl_fftb(&n_rev[0],&n_rev[1],&n_rev[2],
			       arr, &n_rev[0],&n_rev[1],
			       arr, &n_rev[0],&n_rev[1],
			       w1,w2,w3,work),
		     1.0/N,
		     compute_accuracy);     	  
#endif
}

/*
 * Benchmark the NAG multi-dimensional complex FFT (c06fjf, or c06fje
 * when FFTW_REAL is not double-sized) on a 3d transform.
 *
 * The body is only compiled when HAVE_LIBNAG is defined.  It returns
 * immediately unless rank == 3.
 *
 *   n_rev             dimension array handed to the NAG routine
 *   N                 total size; also the offset between the real and
 *                     imaginary arrays inside arr.  N == 0 skips the
 *                     probing call.
 *   arr               data, viewed as N reals followed by N reals
 *   work, size_work   workspace; size_work / 2 is passed as its length
 *   compute_accuracy  passed NEGATED to DO_BENCHMARK_ND (see below)
 *
 * The routine is called once up front so that a failure code can be
 * turned into a skip_benchmark() message instead of a timed run.
 */
void do_nag_3d_fft(int rank, int *n, int *n_rev, int N, 
		   short is_power_of_two,
		   FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
		   int size_arr, int size_work,
		   short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBNAG
     typedef void fftfb_proc(int *ndim, int *n, int *n_tot,
			     FFTW_REAL *re, FFTW_REAL *im,
			     FFTW_REAL *work, int *lwork, int *fail);
     typedef void (*fftfb_proc_ptr)(int *ndim, int *n, int *n_tot,
				    FFTW_REAL *re, FFTW_REAL *im,
				    FFTW_REAL *work, int *lwork, int *fail);
     fftfb_proc_ptr nag_fftf;
     /* NOTE(review): NAG documentation describes an entry value of 0
	as "hard failure" mode -- confirm that errors really come back
	through fail rather than terminating the program. */
     int fail = 0; /* MUST be initialized to 0 for NAG */
     FFTW_REAL *re,*im;
     int lwork = size_work / 2;
     extern fftfb_proc 
	  FORTRANIZE(c06fjf,C06FJF), FORTRANIZE(c06fje,C06FJE);

     if (rank != 3) return;
   
     FFT_NAME("NAG");

     /* Pick the entry point matching the compiled precision. */
     if (sizeof(double) == sizeof(FFTW_REAL)) {
	  nag_fftf = FORTRANIZE(c06fjf,C06FJF);
     }
     else {
	  nag_fftf = FORTRANIZE(c06fje,C06FJE);
     }

     /* Real and imaginary parts are passed as two separate arrays:
	the first N reals of arr and the N reals that follow. */
     re = (FFTW_REAL *)arr;
     im = re + N;

     /* Note: NAG has no initialization or inverse routines. */
     
     if (N != 0) {
	  /* Call once to check for failures: */
	  nag_fftf(&rank,n_rev,&N,re,im,
		   (FFTW_REAL*)work,&lwork,&fail);
     }
     if (fail == 0) {
	  /* No inverse routine exists, so 0 stands in for the inverse
	     call and compute_accuracy is negated (presumably the
	     macro's signal for "forward only" -- confirm against
	     DO_BENCHMARK_ND). */
	  DO_BENCHMARK_ND(rank, n, N, arr, arr,
			  nag_fftf(&rank,n_rev,&N,re,im,
				   (FFTW_REAL*)work,&lwork,&fail),
			  sqrt(N), -1, 0,
			  0,
			  1.0/N,
			  -compute_accuracy);     	  
     }
     else {
	  /* Translate the failure code into a message.  Codes above 10
	     are decoded by their last digit.  skip_benchmark is used
	     without a trailing semicolon here, so it must expand to a
	     complete statement. */
	  if (fail == 1)
	       skip_benchmark("rank < 1")
	  else if (fail == 2)
	       skip_benchmark("N != product of dimensions")
	  else if (fail > 10 && fail % 10 == 1) 
               skip_benchmark("NAG can't handle factors > 19")
	  else if (fail > 10 && fail % 10 == 2)
               skip_benchmark("NAG can't handle more than 20 prime factors")
	  else if (fail > 10 && fail % 10 == 3)
	       skip_benchmark("NAG can't handle N <= 1")
	  else if (fail > 10 && fail % 10 == 4)
	       skip_benchmark("NAG requires work size >= 3 * max(n)")
	  else
	       skip_benchmark("Unknown error in NAG")
     }
#endif
}

/*
 * Benchmark the DXML 3d complex FFT (cfft_*_3d_ for float-sized
 * FFTW_REAL, zfft_*_3d_ for double-sized) on an in-place transform.
 *
 * The body is only compiled when HAVE_LIBDXML is defined.  It returns
 * immediately unless rank == 3, and does nothing if FFTW_REAL is
 * neither float- nor double-sized.
 *
 *   n_rev             dimensions; n_rev[0..2] go to the init routine,
 *                     n_rev[0] and n_rev[1] to the apply routine
 *   N                 total size; N == 0 skips both the init and the
 *                     exit call
 *   arr               data, used as both input and output
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 */
void do_dxml_3d_fft(int rank, int *n, int *n_rev, int N,
                    short is_power_of_two,
                    FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                    int size_arr, int size_work,
                    short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBDXML
     if (rank != 3)
          return;

     FFT_NAME("DXML");

     if (sizeof(FFTW_REAL) == sizeof(float)) {
          /* Single-precision entry points. */
          DXML_C_FFT_STRUCTURE_3D dxml_state;
          int unit_stride_flag = 1, unit_stride = 1;

          if (N != 0)
               cfft_init_3d_(&n_rev[0],&n_rev[1],&n_rev[2],
                             &dxml_state,&unit_stride_flag);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          cfft_apply_3d_("C","C","F",arr,arr,
                                         &n_rev[0],&n_rev[1],
                                         &dxml_state,
                                         &unit_stride,&unit_stride,
                                         &unit_stride),
                          1.0, -1, 1,
                          cfft_apply_3d_("C","C","B",arr,arr,
                                         &n_rev[0],&n_rev[1],
                                         &dxml_state,
                                         &unit_stride,&unit_stride,
                                         &unit_stride),
                          1.0/N,
                          compute_accuracy);

          /* Release the state only if it was initialized above. */
          if (N != 0)
               cfft_exit_3d_(&dxml_state);
          return;
     }

     if (sizeof(FFTW_REAL) == sizeof(double)) {
          /* Double-precision entry points. */
          DXML_Z_FFT_STRUCTURE_3D dxml_state;
          int unit_stride_flag = 1, unit_stride = 1;

          if (N != 0)
               zfft_init_3d_(&n_rev[0],&n_rev[1],&n_rev[2],
                             &dxml_state,&unit_stride_flag);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          zfft_apply_3d_("C","C","F",arr,arr,
                                         &n_rev[0],&n_rev[1],
                                         &dxml_state,
                                         &unit_stride,&unit_stride,
                                         &unit_stride),
                          1.0, -1, 1,
                          zfft_apply_3d_("C","C","B",arr,arr,
                                         &n_rev[0],&n_rev[1],
                                         &dxml_state,
                                         &unit_stride,&unit_stride,
                                         &unit_stride),
                          1.0/N,
                          compute_accuracy);

          /* Release the state only if it was initialized above. */
          if (N != 0)
               zfft_exit_3d_(&dxml_state);
     }
#endif
}

/*
 * Benchmark the SGI complib.sgimath 3d complex FFT (zfft3d when
 * FFTW_REAL is double-sized, cfft3d otherwise) on an in-place transform.
 *
 * The body is only compiled when HAVE_LIBCOMPLIB_SGIMATH is defined.
 * It returns immediately unless rank == 3.
 *
 *   n_rev             dimensions; n_rev[0..2] are the transform lengths
 *                     and n_rev[0], n_rev[1] the leading dimensions
 *   N                 total size; N == 0 skips the initialization call
 *   arr               data, transformed in place
 *   work              storage handed to the init routine and then to
 *                     every transform call as the "save" argument
 *   compute_accuracy  forwarded unchanged to DO_BENCHMARK_ND
 */
void do_sgimath_3d_fft(int rank, int *n, int *n_rev, int N,
                       short is_power_of_two,
                       FFTW_COMPLEX *arr, FFTW_COMPLEX *work,
                       int size_arr, int size_work,
                       short compute_accuracy, factor_type allowed_factors)
{
#ifdef HAVE_LIBCOMPLIB_SGIMATH
     if (rank != 3)
          return;

     FFT_NAME("SGIMATH");

     if (sizeof(FFTW_REAL) != sizeof(double)) {
          /* Single-precision entry points, declared locally. */
          FFTW_COMPLEX *cfft3di(int len1, int len2, int len3,
                                FFTW_COMPLEX *coeffs);
          int cfft3d(int direction, int len1, int len2, int len3,
                     FFTW_COMPLEX *data, int lead1, int lead2,
                     FFTW_COMPLEX *coeffs);

          /* Initialization call; its return value is not used. */
          if (N != 0)
               cfft3di(n_rev[0], n_rev[1], n_rev[2], work);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          cfft3d(-1, n_rev[0], n_rev[1], n_rev[2],
                                 arr, n_rev[0], n_rev[1], work),
                          1.0, -1, 1,
                          cfft3d(+1, n_rev[0], n_rev[1], n_rev[2],
                                 arr, n_rev[0], n_rev[1], work),
                          1.0/N,
                          compute_accuracy);
     }
     else {
          /* Double-precision entry points, declared locally. */
          FFTW_COMPLEX *zfft3di(int len1, int len2, int len3,
                                FFTW_COMPLEX *coeffs);
          int zfft3d(int direction, int len1, int len2, int len3,
                     FFTW_COMPLEX *data, int lead1, int lead2,
                     FFTW_COMPLEX *coeffs);

          /* Initialization call; its return value is not used. */
          if (N != 0)
               zfft3di(n_rev[0], n_rev[1], n_rev[2], work);

          DO_BENCHMARK_ND(rank, n, N, arr, arr,
                          zfft3d(-1, n_rev[0], n_rev[1], n_rev[2],
                                 arr, n_rev[0], n_rev[1], work),
                          1.0, -1, 1,
                          zfft3d(+1, n_rev[0], n_rev[1], n_rev[2],
                                 arr, n_rev[0], n_rev[1], work),
                          1.0/N,
                          compute_accuracy);
     }
#endif
}

