CUB reduce_by_key

Question

OptimusPrime

Asked: 2025-04-15 09:59:41 +0800 CST

使用 cufftPlanMany API 在每个维度上实现 1D FFT，在 Cuda 中执行 4D FFT 的正确方法是什么

772

Cuda 没有 4D FFT 的直接实现。因此，我想将 4D FFT 分解为 4 个 1D FFT，分别对应 X、Y、Z 和 W 维度。我理解 cufftPlanMany API 最适合此用途，因为它无需使用任何 for 循环，因此速度更快。

我为此编写了一个程序。但是，4D FFT 的最终结果与 4D FFTW 实现不匹配。

以下是分别使用 FFTW 和 Cuda 库的两种实现。我仔细选择了 cufftPlanMany 函数的批次数（batch）、步长（stride）和间距（dist）参数。但是，我不知道自己哪里做错了。如有任何帮助，我将不胜感激。

FFTW 4D实现

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <fftw3.h>

// Set to 1 to print sample values of the input/output arrays and the
// per-iteration timings; set to 0 to suppress that output.
#define PRINT_FLAG 1
// Number of complex values printed from each end of an array.
#define NPRINTS 5  // print size

/*
 * Prints the first and the last NPRINTS entries of a complex array, one
 * "(re, imi)" pair per line, with a "..." line between the two groups.
 *
 * complex_array: array holding at least `size` fftw_complex values.
 * size:          number of valid elements in complex_array.
 *
 * When `size` is smaller than NPRINTS both groups are clamped to the valid
 * range (the few available elements are then printed in both groups).
 */
void printf_fftw_cmplx_array(fftw_complex *complex_array, unsigned int size) {
    // Clamp the window: without this, `size - NPRINTS` wraps around in
    // unsigned arithmetic and the first loop reads past the end of the array.
    const unsigned int count = (size < NPRINTS) ? size : NPRINTS;

    for (unsigned int i = 0; i < count; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
    printf("...\n");
    for (unsigned int i = size - count; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
}

/*
 * Runs one in-place forward 4D complex FFT of shape nx x ny x nz x nw with
 * FFTW and returns the CPU time spent in fftw_execute(), in seconds.
 *
 * The input is a real-valued pseudo-random signal (fixed seed 2025, zero
 * imaginary part). When PRINT_FLAG is set, samples of the input and of the
 * resulting coefficients are printed.
 *
 * Terminates the process with EXIT_FAILURE if the buffer or the plan cannot
 * be created.
 */
float run_test_fftw_4d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    fftw_complex *complex_data;
    fftw_plan plan;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(fftw_complex) * element_size;

    clock_t start, stop;
    float elapsed_time;

    // Allocate one buffer used as both input and output (in-place transform)
    complex_data = (fftw_complex *)fftw_malloc(size);
    if (complex_data == NULL) {
        fprintf(stderr, "Error: fftw_malloc failed for %zu bytes\n", size);
        exit(EXIT_FAILURE);
    }

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i][0] = rand() / (float)RAND_MAX;
        complex_data[i][1] = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Setup the FFT plan. FFTW_ESTIMATE does not touch the arrays while
    // planning, so initializing the data before this call is safe.
    int dims[4] = { (int)nx, (int)ny, (int)nz, (int)nw };
    plan = fftw_plan_dft(4, dims, complex_data, complex_data, FFTW_FORWARD, FFTW_ESTIMATE);
    if (plan == NULL) {
        fprintf(stderr, "Error: fftw_plan_dft failed for size %u x %u x %u x %u\n",
                nx, ny, nz, nw);
        fftw_free(complex_data);
        exit(EXIT_FAILURE);
    }

    // Start time
    start = clock();

    // Execute the FFT
    fftw_execute(plan);

    // End time
    stop = clock();

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC;

    // Clean up
    fftw_destroy_plan(plan);
    fftw_free(complex_data);
    fftw_cleanup();

    return elapsed_time;
}


/*
 * Entry point: parses four FFT lengths and an iteration count from the
 * command line, runs run_test_fftw_4d() that many times and prints the mean
 * elapsed time in seconds.
 *
 * Returns 0 on success, -1 on a usage error (wrong argument count or a
 * non-positive argument).
 */
int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    // Reject zero/negative values up front: a negative length would wrap to
    // a huge unsigned size, and zero iterations would make the mean 0/0.
    int args[5];
    for (int k = 0; k < 5; ++k) {
        args[k] = atoi(argv[k + 1]);
        if (args[k] <= 0) {
            printf("Error: argument '%s' must be a positive integer.\n", argv[k + 1]);
            return -1;
        }
    }

    unsigned int nx = (unsigned int)args[0];
    unsigned int ny = (unsigned int)args[1];
    unsigned int nz = (unsigned int)args[2];
    unsigned int nw = (unsigned int)args[3];
    unsigned int niter = (unsigned int)args[4];

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_fftw_4d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%u]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    return 0;
}

错误的 cuFFT4D 实现

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h> 
#include <cufft.h>
#include <math.h>

// Set to 1 to print sample values of the input/output arrays and the
// per-iteration timings; set to 0 to suppress that output.
#define PRINT_FLAG 1
// Number of complex values printed from each end of an array.
#define NPRINTS 5  // print size

// Evaluates a CUDA runtime call exactly once; if it does not return
// cudaSuccess, prints the source location, numeric code and error string to
// stderr and terminates the process with EXIT_FAILURE.
// The body is wrapped in do { ... } while (0) so that `CHECK_CUDA(x);` acts
// as a single statement (safe inside un-braced if/else), and the local uses
// a reserved-looking name so it cannot capture an identifier used in `call`.
#define CHECK_CUDA(call)                                                       \
    do {                                                                       \
        const cudaError_t check_cuda_err_ = (call);                            \
        if (check_cuda_err_ != cudaSuccess)                                    \
        {                                                                      \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);             \
            fprintf(stderr, "code: %d, reason: %s\n", (int)check_cuda_err_,    \
                    cudaGetErrorString(check_cuda_err_));                      \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

// Evaluates a cuFFT call exactly once; if it does not return CUFFT_SUCCESS,
// prints the numeric cufftResult and the source location to stderr and
// terminates the process with EXIT_FAILURE.
// The body is wrapped in do { ... } while (0) so that `CHECK_CUFFT(x);` acts
// as a single statement (safe inside un-braced if/else), and the local uses
// a reserved-looking name so it cannot capture an identifier used in `call`.
#define CHECK_CUFFT(call)                                                      \
    do {                                                                       \
        const cufftResult check_cufft_err_ = (call);                           \
        if (check_cufft_err_ != CUFFT_SUCCESS)                                 \
        {                                                                      \
            fprintf(stderr, "Got CUFFT error %d at %s:%d\n",                   \
                    (int)check_cufft_err_, __FILE__, __LINE__);                \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

// Prints the first and the last NPRINTS entries of a host-side complex array,
// one "(re, imi)" pair per line, with a "..." line between the two groups.
//
//   complex_array: host array holding at least `size` cufftComplex values.
//   size:          number of valid elements in complex_array.
//
// When `size` is smaller than NPRINTS both groups are clamped to the valid
// range (the few available elements are then printed in both groups).
void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
    // Clamp the window: without this, `size - NPRINTS` wraps around in
    // unsigned arithmetic and the first loop reads past the end of the array.
    const unsigned int count = (size < NPRINTS) ? size : NPRINTS;

    for (unsigned int i = 0; i < count; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - count; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

// Attempts a forward 4D complex FFT of shape nx x ny x nz x nw as four
// batched 1D cuFFT transforms (one cufftPlanMany plan per axis), executed
// in place on the device, and returns the elapsed time in seconds measured
// with CUDA events (host->device copy, the four transforms, plan destruction
// and device->host copy are all inside the timed region).
//
// NOTE(review): this is the version the surrounding text presents as
// producing wrong results. The strides used below imply a row-major
// [nx][ny][nz][nw] layout (w contiguous). With that layout the x and w plans
// address the right elements, but the y and z plans do not; see the notes at
// those plans. The code is intentionally left unchanged.
float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    // Fixed seed so every run uses the same input signal.
    srand(2025);

    // Declaration
    cufftComplex *complex_data;
    cufftComplex *d_complex_data;
    cufftHandle plan1d_x, plan1d_y, plan1d_z, plan1d_w;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(cufftComplex) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (cufftComplex *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));

    // x axis: elements of one transform are ny*nz*nw apart, and batch b
    // starts at element b. Every (y, z, w) offset in [0, ny*nz*nw) is a valid
    // line start, so this plan covers the array correctly.
    int n[1] = { (int)nx };
    int embed[1] = { (int)nx };
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, ny * nz * nw, 1, // inembed, istride, idist
                            embed, ny * nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, ny * nz * nw));
    // NOTE(review): y axis. Batch b again starts at element b (idist = 1),
    // but a valid y-line starts at x*ny*nz*nw + (z*nw + w). Only the first
    // nz*nw batches (x = 0) start on a valid line; later batches start in the
    // middle of a line and run into the next x-slab. A single constant
    // distance cannot express the two-level set of line starts.
    n[0] = (int)ny;
    embed[0] = (int)ny;
    CHECK_CUFFT(cufftPlanMany(&plan1d_y, 1, n,       // 1D FFT of size ny
                            embed, nz * nw, 1, // inembed, istride, idist
                            embed, nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * nz * nw));
    // NOTE(review): z axis has the same problem. A valid z-line starts at
    // (x*ny + y)*nz*nw + w, so only the first nw batches are aligned.
    n[0] = (int)nz;
    embed[0] = (int)nz;
    CHECK_CUFFT(cufftPlanMany(&plan1d_z, 1, n,       // 1D FFT of size nz
                            embed, nw, 1, // inembed, istride, idist
                            embed, nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nw));
    // w axis: contiguous lines of nw elements, consecutive lines nw apart.
    // This plan covers the array correctly.
    n[0] = (int)nw;
    embed[0] = (int)nw;
    CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                            embed, 1, nw, // inembed, istride, idist
                            embed, 1, nw, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nz));

    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially (in place); each plan is
    // destroyed right after its single use, inside the timed region.
    CHECK_CUFFT(cufftExecC2C(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_x));
    CHECK_CUFFT(cufftExecC2C(plan1d_y, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_y));
    CHECK_CUFFT(cufftExecC2C(plan1d_z, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_z));
    CHECK_CUFFT(cufftExecC2C(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_w));

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time (cudaEventElapsedTime reports milliseconds)
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    // Convert milliseconds to seconds
    return elapsed_time * 1e-3;
}


// Entry point: parses four FFT lengths and an iteration count from the
// command line, runs run_test_cufft_4d_4x1d() that many times and prints the
// mean elapsed time in seconds.
//
// Returns 0 on success, -1 on a usage error (wrong argument count or a
// non-positive argument).
int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    // Reject zero/negative values up front: a negative length would wrap to
    // a huge unsigned size, and zero iterations would make the mean 0/0.
    int args[5];
    for (int k = 0; k < 5; ++k) {
        args[k] = atoi(argv[k + 1]);
        if (args[k] <= 0) {
            printf("Error: argument '%s' must be a positive integer.\n", argv[k + 1]);
            return -1;
        }
    }

    unsigned int nx = (unsigned int)args[0];
    unsigned int ny = (unsigned int)args[1];
    unsigned int nz = (unsigned int)args[2];
    unsigned int nw = (unsigned int)args[3];
    unsigned int niter = (unsigned int)args[4];

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%u]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}

尝试两种 4x4x4x4 数组的实现，你会发现只有前几个系数匹配。我知道 FFTW 实现能产生正确的结果，因为我可以用不同的方式得到相同的结果，例如先进行 3D FFT，再进行 1D FFT，或者同时使用 FFTW 和 cuFFT 库进行 2 次 2D FFT。

1 个回答

Voted

Robert Crovella · Answer 1 · 2025-04-17T03:22:04+08:00

摘要：直接使用一维变换序列不适用于 CUFFT 高级数据布局。先进行一组 3D 变换，再进行一组 1D 变换是可行的。如果在序列中进行转置，似乎也可以使用一维变换序列。

详细说明：虽然可以通过 cufftPlanMany 使用 CUFFT 的“高级数据布局”将二维变换分解为两组一维变换，但三维（或更高维）变换无法以这种方式分解，因为（例如）相邻两个 y 维变换起点之间的距离不是恒定的（在三维情况下会变化，在二维情况下则不会）。以 4x4x4 的情况为例：对于前 4 个变换，批次中一个变换的起点到下一个变换的起点的距离为 1 个元素；但在前 4 个变换之后，到下一个变换起点的距离就大于 1 个元素。一维（批量）变换的高级数据布局只有一个固定的距离参数（idist/odist），因此无法表达这种情况。（此处也提出了类似的说法。）

楼主已经提到了这一点，但我还是在这里重复一遍，并在稍后加以演示：先批量执行一组 3D 变换，再批量执行一组 1D 变换，就可以实现 4D 变换，无需任何额外的工作。然而，问题问的是如何只用 1D 变换来实现。按照这里的建议使用转置，通过一些额外的工作似乎也可以做到。

接下来的演示似乎可以证明它有效。我之前没有仔细考虑如何进行四维转置，所以这个核函数只是草草了事，肯定没有经过优化，而且在非立方情况下可能根本不起作用（也就是说，我只在 4 个维度都相同的情况下测试了它。即使那样，测试也相当简单。）我还将展示如何先进行 3D 变换，然后再进行 1D 变换：

# cat t369.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <math.h>

// Set to 1 to print sample values of the input/output arrays and the
// per-iteration timings; set to 0 to suppress that output.
#define PRINT_FLAG 1
// Number of complex values printed from each end of an array.
#define NPRINTS 5  // print size

// Evaluates a CUDA runtime call exactly once; if it does not return
// cudaSuccess, prints the source location, numeric code and error string to
// stderr and terminates the process with EXIT_FAILURE.
// The body is wrapped in do { ... } while (0) so that `CHECK_CUDA(x);` acts
// as a single statement (safe inside un-braced if/else), and the local uses
// a reserved-looking name so it cannot capture an identifier used in `call`.
#define CHECK_CUDA(call)                                                       \
    do {                                                                       \
        const cudaError_t check_cuda_err_ = (call);                            \
        if (check_cuda_err_ != cudaSuccess)                                    \
        {                                                                      \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);             \
            fprintf(stderr, "code: %d, reason: %s\n", (int)check_cuda_err_,    \
                    cudaGetErrorString(check_cuda_err_));                      \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)

// Evaluates a cuFFT call exactly once; if it does not return CUFFT_SUCCESS,
// prints the numeric cufftResult and the source location to stderr and
// terminates the process with EXIT_FAILURE.
// The body is wrapped in do { ... } while (0) so that `CHECK_CUFFT(x);` acts
// as a single statement (safe inside un-braced if/else), and the local uses
// a reserved-looking name so it cannot capture an identifier used in `call`.
#define CHECK_CUFFT(call)                                                      \
    do {                                                                       \
        const cufftResult check_cufft_err_ = (call);                           \
        if (check_cufft_err_ != CUFFT_SUCCESS)                                 \
        {                                                                      \
            fprintf(stderr, "Got CUFFT error %d at %s:%d\n",                   \
                    (int)check_cufft_err_, __FILE__, __LINE__);                \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
// Element type of the host and device buffers (double-precision complex).
using mt = cufftDoubleComplex;
// cuFFT transform type handed to cufftPlanMany; must correspond to `mt`.
#define TT CUFFT_Z2Z
// cuFFT execution function used to run the plans; must correspond to TT.
#define GG cufftExecZ2Z

// Linear offset of element (x, y, z, w) in a 4D array whose x index varies
// fastest and whose w index varies slowest; lx, ly, lz are the extents of
// the x, y and z axes. Every argument is parenthesized so that expressions
// (e.g. `i + 1`) can be passed without changing the result.
#define IDX(x,y,z,w,lx,ly,lz) (((w)*(lx)*(ly)*(lz))+((z)*(lx)*(ly))+((y)*(lx))+(x))

// Out-of-place cyclic rotation of the axes of a 4D array.
//
// The input has extents (lx, ly, lz, lw) with x varying fastest and w
// slowest. The output holds the same elements with extents (ly, lz, lw, lx):
// y becomes the fastest axis and the former fastest axis x becomes the
// slowest. Applying the kernel four times, rotating the extents passed in
// the same way each time, restores the original layout.
//
//   i: input array of lx*ly*lz*lw elements (read only).
//   o: output array of the same size; must not overlap `i`.
//
// Launch layout: a 3D grid whose threads cover at least (lx, ly, lz) in
// (x, y, z); each thread loops over the w axis. No shared memory is used.
template <typename T>
__global__ void k4dt(const T * __restrict__ i, T * __restrict__ o, int lx, int ly, int lz, int lw){

  const int idx = blockDim.x*blockIdx.x+threadIdx.x;
  const int idy = blockDim.y*blockIdx.y+threadIdx.y;
  const int idz = blockDim.z*blockIdx.z+threadIdx.z;
  // The grid rarely divides the extents evenly; drop the excess threads.
  if ((idx >= lx)||(idy >= ly)||(idz >= lz)) return;

  // 64-bit offsets so large arrays do not overflow int arithmetic.
  // Source offset for w = 0 and its increment per step in w (x fastest).
  const size_t src_base = ((size_t)idz*ly + idy)*lx + idx;
  const size_t src_step = (size_t)lx*ly*lz;
  // Destination offset for w = 0 and its increment per step in w
  // (y fastest, then z, then w, with x slowest).
  const size_t dst_base = ((size_t)idx*lw*lz + idz)*ly + idy;
  const size_t dst_step = (size_t)ly*lz;

  for (int idw = 0; idw < lw; idw++)
    o[dst_base + idw*dst_step] = i[src_base + idw*src_step];
}


// Prints the first and the last NPRINTS entries of a host-side complex array,
// one "(re, imi)" pair per line, with a "..." line between the two groups.
//
//   complex_array: host array holding at least `size` values of type mt.
//   size:          number of valid elements in complex_array.
//
// When `size` is smaller than NPRINTS both groups are clamped to the valid
// range (the few available elements are then printed in both groups).
void printf_cufft_cmplx_array(mt *complex_array, unsigned int size) {
    // Clamp the window: without this, `size - NPRINTS` wraps around in
    // unsigned arithmetic and the first loop reads past the end of the array.
    const unsigned int count = (size < NPRINTS) ? size : NPRINTS;

    for (unsigned int i = 0; i < count; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - count; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

// Runs one forward 4D complex FFT of shape nx x ny x nz x nw on the GPU and
// returns the elapsed time in seconds, measured with CUDA events around the
// host->device copy, the transforms and the device->host copy.
//
// The data is laid out row-major as [nx][ny][nz][nw] (w contiguous), i.e.
// the layout FFTW uses for fftw_plan_dft(4, {nx, ny, nz, nw}, ...), so the
// result is comparable with FFTW for non-cubic sizes as well.
//
// Two strategies, selected at compile time:
//   USE_1D defined : four batched 1D transforms, always along the contiguous
//                    axis, each followed by a k4dt axis rotation that brings
//                    the next axis into the contiguous position. After four
//                    rotations the data is back in its original layout.
//   otherwise      : one batch of contiguous 3D transforms over (ny, nz, nw)
//                    followed by one batch of strided 1D transforms over the
//                    slowest axis (length nx).
//
// The input is a real-valued pseudo-random signal (fixed seed 2025). When
// PRINT_FLAG is set, samples of the input and output are printed.
// Terminates the process on invalid sizes or on any allocation/CUDA/cuFFT
// failure.
float run_test_cufft_4d_4x1d(int nx, int ny, int nz, int nw) {
    srand(2025);

    if (nx <= 0 || ny <= 0 || nz <= 0 || nw <= 0) {
        fprintf(stderr, "Error: FFT lengths must be positive (got %d %d %d %d)\n",
                nx, ny, nz, nw);
        exit(EXIT_FAILURE);
    }

    // Declaration
    mt *complex_data;
    mt *d_complex_data;
    mt *d_transpose_data;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(mt) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (mt *)malloc(size);
    if (complex_data == NULL) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", size);
        exit(EXIT_FAILURE);
    }

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
    CHECK_CUDA(cudaMalloc((void **)&d_transpose_data, size));
    int n[3];
    int embed[1];
#ifdef USE_1D
    // Axis extents ordered fastest -> slowest for the [nx][ny][nz][nw] layout.
    // Step a transforms the axis of length dims[a], which the preceding
    // rotations have brought into the contiguous position.
    const int dims[4] = { nw, nz, ny, nx };
    cufftHandle plan1d[4];
    for (int a = 0; a < 4; ++a) {
        n[0] = dims[a];
        embed[0] = dims[a];
        CHECK_CUFFT(cufftPlanMany(&plan1d[a], 1, n,  // 1D FFT of size dims[a]
                                embed, 1, dims[a], // inembed, istride, idist
                                embed, 1, dims[a], // onembed, ostride, odist
                                TT, (int)(element_size / dims[a])));
    }
#else
    // nx contiguous 3D blocks of shape ny x nz x nw. With NULL embed
    // pointers cuFFT ignores the stride/dist arguments and assumes a
    // contiguous layout.
    cufftHandle plan3d;
    n[0] = ny;
    n[1] = nz;
    n[2] = nw;
    CHECK_CUFFT(cufftPlanMany(&plan3d, 3, n,       // 3D FFT of size ny,nz,nw
                            nullptr, 1, 0, // inembed, istride, idist
                            nullptr, 1, 0, // onembed, ostride, odist
                            TT, nx));

    // Slowest axis: elements of one transform are ny*nz*nw apart, and every
    // offset in [0, ny*nz*nw) starts a line, so a distance of 1 is exact.
    cufftHandle plan1d_x;
    n[0] = nx;
    embed[0] = nx;
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, ny*nz*nw, 1, // inembed, istride, idist
                            embed, ny*nz*nw, 1, // onembed, ostride, odist
                            TT, ny * nz * nw));
#endif


    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
#ifdef USE_1D
    printf("using 1D transforms\n");
    dim3 b(8,8,8);
    mt *src = d_complex_data;
    mt *dst = d_transpose_data;
    for (int a = 0; a < 4; ++a) {
        // Current extents, fastest -> slowest.
        const int l0 = dims[a];
        const int l1 = dims[(a + 1) % 4];
        const int l2 = dims[(a + 2) % 4];
        const int l3 = dims[(a + 3) % 4];
        dim3 g((l0+b.x-1)/b.x, (l1+b.y-1)/b.y, (l2+b.z-1)/b.z);

        // Transform the contiguous axis in place, then rotate the axes so
        // that the next one becomes contiguous.
        CHECK_CUFFT(GG(plan1d[a], src, src, CUFFT_FORWARD));
        k4dt<<<g,b>>>(src, dst, l0, l1, l2, l3);
        CHECK_CUDA(cudaGetLastError());  // surface launch-configuration errors

        mt *tmp = src;
        src = dst;
        dst = tmp;
    }
    // After an even number of swaps the result is back in d_complex_data.
#else
    printf("using 3D and 1D transforms\n");
    CHECK_CUFFT(GG(plan3d, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(GG(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
#endif

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time (cudaEventElapsedTime reports milliseconds)
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up (plans are destroyed here so that their teardown is not
    // counted in the measured time)
#ifdef USE_1D
    for (int a = 0; a < 4; ++a) {
        CHECK_CUFFT(cufftDestroy(plan1d[a]));
    }
#else
    CHECK_CUFFT(cufftDestroy(plan3d));
    CHECK_CUFFT(cufftDestroy(plan1d_x));
#endif
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaFree(d_transpose_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    // Convert milliseconds to seconds
    return elapsed_time * 1e-3;
}


// Entry point: parses four FFT lengths and an iteration count from the
// command line, runs run_test_cufft_4d_4x1d() that many times and prints the
// mean elapsed time in seconds.
//
// Returns 0 on success, -1 on a usage error (wrong argument count or a
// non-positive argument).
int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    // Reject zero/negative values up front: the test routine takes signed
    // lengths, and zero iterations would make the mean 0/0.
    int args[5];
    for (int k = 0; k < 5; ++k) {
        args[k] = atoi(argv[k + 1]);
        if (args[k] <= 0) {
            printf("Error: argument '%s' must be a positive integer.\n", argv[k + 1]);
            return -1;
        }
    }

    const int nx = args[0];
    const int ny = args[1];
    const int nz = args[2];
    const int nw = args[3];
    const unsigned int niter = (unsigned int)args[4];

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%u]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}
# nvcc -o t369_cufft t369.cu -lcufft
# compute-sanitizer ./t369_cufft 4 4 4 4 1
========= COMPUTE-SANITIZER
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
using 3D and 1D transforms
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.283668 s
0.283668
========= ERROR SUMMARY: 0 errors
# nvcc -o t369_cufft t369.cu -lcufft -DUSE_1D
# compute-sanitizer ./t369_cufft 4 4 4 4 1
========= COMPUTE-SANITIZER
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
using 1D transforms
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.285792 s
0.285792
========= ERROR SUMMARY: 0 errors
#

当我针对 4 4 4 4 1 情况（4x4x4x4 数组）运行 OP 的 FFTW 代码（我没有在这里重复）时，我得到如下输出：

# ./t369_fftw 4 4 4 4 1
Complex data...
  (0.2005, 0.0000i)
  (0.4584, 0.0000i)
  (0.8412, 0.0000i)
  (0.6970, 0.0000i)
  (0.3846, 0.0000i)
...
  (0.5214, 0.0000i)
  (0.3179, 0.0000i)
  (0.9771, 0.0000i)
  (0.1417, 0.0000i)
  (0.5867, 0.0000i)
Fourier Coefficients...
  (121.0454, 0.0000i)
  (-1.6709, -1.3923i)
  (-12.7056, 0.0000i)
  (-1.6709, 1.3923i)
  (-1.3997, -3.1249i)
...
  (1.0800, 0.8837i)
  (2.0585, -2.7097i)
  (1.1019, 1.7167i)
  (4.9727, 0.1244i)
  (-1.2561, 0.6645i)
[0]: 0.000018 s
0.000018
#

因此，就输出所描绘的内容而言，事情似乎是匹配的。

笔记：

在一维情况下，我们只需要使用 x 维度的变换。转置操作会将下一个必要的维度带入 x 维度，以便进行下一次变换。
在比较 FFTW 和 CUFFT 时，通常重要的是确保两者使用相同的基本类型。楼主的代码没有做到这一点；我修改了我的演示程序，改用 cufftDoubleComplex，使其（至少在我的系统上）与 fftw_complex 的精度（double）相匹配。
CUDA 12.2 仅进行了少量测试。特别是，转置运算并未经过仔细测试，也未进行任何优化。

使用 cufftPlanMany API 在每个维度上实现 1D FFT，在 Cuda 中执行 4D FFT 的正确方法是什么

FFTW 4D实现

错误的 cuFFT4D 实现

重新格式化数字，在固定位置插入分隔符

为什么 C++20 概念会导致循环约束错误，而老式的 SFINAE 不会？

VScode 自动卸载扩展的问题（Material 主题）

Vue 3：创建时出错“预期标识符但发现‘导入’”[重复]

具有指定基础类型但没有枚举器的“枚举类”的用途是什么？

如何修复未手动导入的模块的 MODULE_NOT_FOUND 错误？

`(表达式，左值) = 右值` 在 C 或 C++ 中是有效的赋值吗？为什么有些编译器会接受/拒绝它？

在 C++ 中，一个不执行任何操作的空程序需要 204KB 的堆，但在 C 中则不需要

PowerBI 目前与 BigQuery 不兼容：Simba 驱动程序与 Windows 更新有关

AdMob：MobileAds.initialize() - 对于某些设备，“java.lang.Integer 无法转换为 java.lang.String”

使用 cufftPlanMany API 在每个维度上实现 1D FFT，在 Cuda 中执行 4D FFT 的正确方法是什么

FFTW 4D实现

错误的 cuFFT4D 实现

1 个回答

相关问题