AskOverflow.Dev

AskOverflow.Dev Logo AskOverflow.Dev Logo

AskOverflow.Dev Navigation

  • 主页
  • 系统&网络
  • Ubuntu
  • Unix
  • DBA
  • Computer
  • Coding
  • LangChain

Mobile menu

Close
  • 主页
  • 系统&网络
    • 最新
    • 热门
    • 标签
  • Ubuntu
    • 最新
    • 热门
    • 标签
  • Unix
    • 最新
    • 标签
  • DBA
    • 最新
    • 标签
  • Computer
    • 最新
    • 标签
  • Coding
    • 最新
    • 标签
主页 / coding / 问题 / 79574267
Accepted
OptimusPrime
OptimusPrime
Asked: 2025-04-15 09:59:41 +0800 CST2025-04-15 09:59:41 +0800 CST 2025-04-15 09:59:41 +0800 CST

使用 cufftPlanMany API 在每个维度上实现 1D FFT,在 Cuda 中执行 4D FFT 的正确方法是什么

  • 772

Cuda 没有 4D FFT 的直接实现。因此,我想将 4D FFT 分解为 4 个 1D FFT,分别对应 X、Y、Z 和 W 维度。我理解 cufftPlanMany API 最适合此用途,因为它无需使用任何 for 循环,因此速度更快。

我为此编写了一个程序。但是,4D FFT 的最终结果与 4D FFTW 实现不匹配。

以下是分别使用 FFTW 和 Cuda 库的两种实现。我仔细选择了 cufftPlanMany 函数的批次、步长和分布。但是,我不知道自己哪里做错了。如有任何帮助,我将不胜感激。

FFTW 4D实现

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <fftw3.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

void printf_fftw_cmplx_array(fftw_complex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i][0], complex_array[i][1]);
    }
}

float run_test_fftw_4d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    fftw_complex *complex_data;
    fftw_plan plan;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(fftw_complex) * element_size;

    clock_t start, stop;
    float elapsed_time;

    // Allocate memory for input and output arrays
    complex_data = (fftw_complex *)fftw_malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i][0] = rand() / (float)RAND_MAX;
        complex_data[i][1] = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Setup the FFT plan
    plan = fftw_plan_dft(4, (int[]){nx, ny, nz, nw}, complex_data, complex_data, FFTW_FORWARD, FFTW_ESTIMATE);

    // Start time
    start = clock();
    
    // Execute the FFT
    fftw_execute(plan);

    // End time
    stop = clock();

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_fftw_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC;

    // Clean up
    fftw_destroy_plan(plan);
    fftw_free(complex_data);
    fftw_cleanup();

    return elapsed_time;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_fftw_4d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    return 0;
}

错误的 cuFFT4D 实现

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h> 
#include <cufft.h>
#include <math.h>

#define PRINT_FLAG 1
#define NPRINTS 5  // print size

#define CHECK_CUDA(call)                                                       \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult error;                                                         \
    if ( (error = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                __LINE__);                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
}

void printf_cufft_cmplx_array(cufftComplex *complex_array, unsigned int size) {
    for (unsigned int i = 0; i < NPRINTS; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
    printf("...\n");
    for (unsigned int i = size - NPRINTS; i < size; ++i) {
        printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
    }
}

float run_test_cufft_4d_4x1d(unsigned int nx, unsigned int ny, unsigned int nz, unsigned int nw) {
    srand(2025);

    // Declaration
    cufftComplex *complex_data;
    cufftComplex *d_complex_data;
    cufftHandle plan1d_x, plan1d_y, plan1d_z, plan1d_w;

    unsigned int element_size = nx * ny * nz * nw;
    size_t size = sizeof(cufftComplex) * element_size;

    cudaEvent_t start, stop;
    float elapsed_time;

    // Allocate memory for the variables on the host
    complex_data = (cufftComplex *)malloc(size);

    // Initialize input complex signal
    for (unsigned int i = 0; i < element_size; ++i) {
        complex_data[i].x = rand() / (float)RAND_MAX;
        complex_data[i].y = 0;
    }

    // Print input stuff
    if (PRINT_FLAG) {
        printf("Complex data...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Create CUDA events
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Allocate device memory for complex signal and output frequency
    CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));

    int n[1] = { (int)nx };
    int embed[1] = { (int)nx };
    CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                            embed, ny * nz * nw, 1, // inembed, istride, idist
                            embed, ny * nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, ny * nz * nw));
    n[0] = (int)ny;
    embed[0] = (int)ny;
    CHECK_CUFFT(cufftPlanMany(&plan1d_y, 1, n,       // 1D FFT of size ny
                            embed, nz * nw, 1, // inembed, istride, idist
                            embed, nz * nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * nz * nw));
    n[0] = (int)nz;
    embed[0] = (int)nz;
    CHECK_CUFFT(cufftPlanMany(&plan1d_z, 1, n,       // 1D FFT of size nz
                            embed, nw, 1, // inembed, istride, idist
                            embed, nw, 1, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nw));
    n[0] = (int)nw;
    embed[0] = (int)nw;
    CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                            embed, 1, nw, // inembed, istride, idist
                            embed, 1, nw, // onembed, ostride, odist
                            CUFFT_C2C, nx * ny * nz));

    // Record the start event
    CHECK_CUDA(cudaEventRecord(start, 0));

    // Copy host memory to device
    CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));

    // Perform FFT along each dimension sequentially
    CHECK_CUFFT(cufftExecC2C(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_x));
    CHECK_CUFFT(cufftExecC2C(plan1d_y, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_y));
    CHECK_CUFFT(cufftExecC2C(plan1d_z, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_z));
    CHECK_CUFFT(cufftExecC2C(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
    CHECK_CUFFT(cufftDestroy(plan1d_w));

    // Retrieve the results into host memory
    CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));

    // Record the stop event
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));

    // Print output stuff
    if (PRINT_FLAG) {
        printf("Fourier Coefficients...\n");
        printf_cufft_cmplx_array(complex_data, element_size);
    }

    // Compute elapsed time
    CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));

    // Clean up
    CHECK_CUDA(cudaFree(d_complex_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    free(complex_data);

    return elapsed_time * 1e-3;
}


int main(int argc, char **argv) {
    if (argc != 6) {
        printf("Error: This program requires exactly 5 command-line arguments.\n");
        printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
        printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
        printf("       arg4: Number of iterations\n");
        printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
        return -1;
    }

    unsigned int nx = atoi(argv[1]);
    unsigned int ny = atoi(argv[2]);
    unsigned int nz = atoi(argv[3]);
    unsigned int nw = atoi(argv[4]);
    unsigned int niter = atoi(argv[5]);

    float sum = 0.0;
    float span_s = 0.0;
    for (unsigned int i = 0; i < niter; ++i) {
        span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
        if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
        sum += span_s;
    }
    printf("%.6f\n", sum/(float)niter);

    CHECK_CUDA(cudaDeviceReset());
    return 0;
}

尝试两种 4x4x4x4 数组的实现,你会发现只有前几个系数匹配。我知道 FFTW 实现能产生正确的结果,因为我可以用不同的方式得到相同的结果,例如先进行 3D FFT,再进行 1D FFT,或者同时使用 FFTW 和 cuFFT 库进行 2 次 2D FFT。

cuda
  • 1 1 个回答
  • 67 Views

1 个回答

  • Voted
  1. Best Answer
    Robert Crovella
    2025-04-17T03:22:04+08:002025-04-17T03:22:04+08:00

    摘要:直接使用一维变换序列不适用于 CUFFT 高级数据布局。先进行一组 3D 变换,再进行一组 1D 变换是可行的。如果在序列中进行转置,似乎也可以使用一维变换序列。

    长话短说:虽然可以使用CUFFT“高级数据布局”(通过)将二维变换分解为两个一维变换cufftPlanMany,但三维(或更高维)变换无法以这种方式分解,因为(例如)连续 y 维变换之间的步长是可变的(在三维情况下是可变的,在二维情况下则不是)。考虑 4x4x4 的情况,对于前 4 个变换,从批次中第一个变换的起点到下一个变换的起点的距离为 1 个元素。但在前 4 个变换之后,到下一个变换起点的距离大于 1 个元素。这无法解释在一维(批量)变换上使用高级数据布局的情况。(此处也提出了类似的说法。)

    楼主已经提到了这一点,但我还是会在这里重复一遍,并在稍后演示一下,这可以通过先批量执行一组 3D 变换,然后再批量执行一组 1D 变换来实现,无需额外的活动/精力/工作。然而,问题在于如何使用 1D 变换来实现。根据这里建议使用转置,似乎可以通过额外的工作来实现。

    接下来的演示似乎可以证明它有效。我之前没有仔细考虑如何进行四维转置,所以这个核函数只是草草了事,肯定没有经过优化,而且在非立方情况下可能根本不起作用(也就是说,我只在 4 个维度都相同的情况下测试了它。即使那样,测试也相当简单。)我还将展示如何先进行 3D 变换,然后再进行 1D 变换:

    # cat t369.cu
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>
    #include <cufft.h>
    #include <math.h>
    
    #define PRINT_FLAG 1
    #define NPRINTS 5  // print size
    
    #define CHECK_CUDA(call)                                                       \
    {                                                                              \
        const cudaError_t error = call;                                            \
        if (error != cudaSuccess)                                                  \
        {                                                                          \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
            fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                    cudaGetErrorString(error));                                    \
            exit(EXIT_FAILURE);                                                    \
        }                                                                          \
    }
    
    #define CHECK_CUFFT(call)                                                      \
    {                                                                              \
        cufftResult error;                                                         \
        if ( (error = (call)) != CUFFT_SUCCESS)                                      \
        {                                                                          \
            fprintf(stderr, "Got CUFFT error %d at %s:%d\n", error, __FILE__,      \
                    __LINE__);                                                     \
            exit(EXIT_FAILURE);                                                    \
        }                                                                          \
    }
    using mt = cufftDoubleComplex;
    #define TT CUFFT_Z2Z
    #define GG cufftExecZ2Z
    
    #define IDX(x,y,z,w,lx,ly,lz) ((w*lx*ly*lz)+(z*lx*ly)+(y*lx)+x)
    
    template <typename T>
    __global__ void k4dt(T *i, T *o, int lx, int ly, int lz, int lw){
    
      int idx = blockDim.x*blockIdx.x+threadIdx.x;
      int idy = blockDim.y*blockIdx.y+threadIdx.y;
      int idz = blockDim.z*blockIdx.z+threadIdx.z;
      if ((idx < lx)&&(idy < ly)&&(idz < lz))
        for (int idw = 0; idw < lw; idw++)
          o[IDX(idy,idz,idw,idx,ly,lz,lw)] = i[IDX(idx,idy,idz,idw,lx,ly,lz)];
    }
    
    
    void printf_cufft_cmplx_array(mt *complex_array, unsigned int size) {
        for (unsigned int i = 0; i < NPRINTS; ++i) {
            printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
        }
        printf("...\n");
        for (unsigned int i = size - NPRINTS; i < size; ++i) {
            printf("  (%2.4f, %2.4fi)\n", complex_array[i].x, complex_array[i].y);
        }
    }
    
    float run_test_cufft_4d_4x1d(int nx, int ny, int nz, int nw) {
        srand(2025);
    
        // Declaration
        mt *complex_data;
        mt *d_complex_data;
        mt *d_transpose_data;
    
        unsigned int element_size = nx * ny * nz * nw;
        size_t size = sizeof(mt) * element_size;
    
        cudaEvent_t start, stop;
        float elapsed_time;
    
        // Allocate memory for the variables on the host
        complex_data = (mt *)malloc(size);
    
        // Initialize input complex signal
        for (unsigned int i = 0; i < element_size; ++i) {
            complex_data[i].x = rand() / (float)RAND_MAX;
            complex_data[i].y = 0;
        }
    
        // Print input stuff
        if (PRINT_FLAG) {
            printf("Complex data...\n");
            printf_cufft_cmplx_array(complex_data, element_size);
        }
    
        // Create CUDA events
        CHECK_CUDA(cudaEventCreate(&start));
        CHECK_CUDA(cudaEventCreate(&stop));
    
        // Allocate device memory for complex signal and output frequency
        CHECK_CUDA(cudaMalloc((void **)&d_complex_data, size));
        CHECK_CUDA(cudaMalloc((void **)&d_transpose_data, size));
        int n[3];
        int embed[1];
    #ifdef USE_1D
        cufftHandle plan1d_x;
        n[0] = { (int)nx };
        embed[0] = { (int)nx };
        CHECK_CUFFT(cufftPlanMany(&plan1d_x, 1, n,       // 1D FFT of size nx
                                embed, 1, nx, // inembed, istride, idist
                                embed, 1, nx, // onembed, ostride, odist
                                TT, ny * nz * nw));
    #else
        cufftHandle plan3d;
        n[0] = nx;
        n[1] = ny;
        n[2] = nz;
        CHECK_CUFFT(cufftPlanMany(&plan3d, 3, n,       // 3D FFT of size nx,ny,nz
                                nullptr, 1, 0, // inembed, istride, idist
                                nullptr, 1, 0, // onembed, ostride, odist
                                TT, nw));
    
        cufftHandle plan1d_w;
        n[0] = (int)nw;
        embed[0] = (int)nw;
        CHECK_CUFFT(cufftPlanMany(&plan1d_w, 1, n,       // 1D FFT of size nw
                                embed, nx*ny*nz, 1, // inembed, istride, idist
                                embed, nx*ny*nz, 1, // onembed, ostride, odist
                                TT, nx * ny * nz));
    #endif
    
    
        // Record the start event
        CHECK_CUDA(cudaEventRecord(start, 0));
    
        // Copy host memory to device
        CHECK_CUDA(cudaMemcpy(d_complex_data, complex_data, size, cudaMemcpyHostToDevice));
    
        // Perform FFT along each dimension sequentially
    #ifdef USE_1D
        printf("using 1D transforms\n");
        dim3 b(8,8,8);
        dim3 g((nx+b.x-1)/b.x, (ny+b.y-1)/b.y, (nz+b.z-1)/b.z);
        CHECK_CUFFT(GG(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
        k4dt<<<g,b>>>(d_complex_data, d_transpose_data, nx, ny, nz, nw);
        CHECK_CUFFT(GG(plan1d_x, d_transpose_data, d_transpose_data, CUFFT_FORWARD));
        k4dt<<<g,b>>>(d_transpose_data, d_complex_data, nx, ny, nz, nw);
        CHECK_CUFFT(GG(plan1d_x, d_complex_data, d_complex_data, CUFFT_FORWARD));
        k4dt<<<g,b>>>(d_complex_data, d_transpose_data, nx, ny, nz, nw);
        CHECK_CUFFT(GG(plan1d_x, d_transpose_data, d_transpose_data, CUFFT_FORWARD));
        k4dt<<<g,b>>>(d_transpose_data, d_complex_data, nx, ny, nz, nw);
        CHECK_CUFFT(cufftDestroy(plan1d_x));
    #else
        printf("using 3D and 1D transforms\n");
        CHECK_CUFFT(GG(plan3d, d_complex_data, d_complex_data, CUFFT_FORWARD));
        CHECK_CUFFT(cufftDestroy(plan3d));
        CHECK_CUFFT(GG(plan1d_w, d_complex_data, d_complex_data, CUFFT_FORWARD));
        CHECK_CUFFT(cufftDestroy(plan1d_w));
    #endif
    
        // Retrieve the results into host memory
        CHECK_CUDA(cudaMemcpy(complex_data, d_complex_data, size, cudaMemcpyDeviceToHost));
    
        // Record the stop event
        CHECK_CUDA(cudaEventRecord(stop, 0));
        CHECK_CUDA(cudaEventSynchronize(stop));
    
        // Print output stuff
        if (PRINT_FLAG) {
            printf("Fourier Coefficients...\n");
            printf_cufft_cmplx_array(complex_data, element_size);
        }
    
        // Compute elapsed time
        CHECK_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
    
        // Clean up
        CHECK_CUDA(cudaFree(d_complex_data));
        CHECK_CUDA(cudaFree(d_transpose_data));
        CHECK_CUDA(cudaEventDestroy(start));
        CHECK_CUDA(cudaEventDestroy(stop));
        free(complex_data);
    
        return elapsed_time * 1e-3;
    }
    
    
    int main(int argc, char **argv) {
        if (argc != 6) {
            printf("Error: This program requires exactly 5 command-line arguments.\n");
            printf("       %s <arg0> <arg1> <arg2> <arg3> <arg4>\n", argv[0]);
            printf("       arg0, arg1, arg2, arg3: FFT lengths in 4D\n");
            printf("       arg4: Number of iterations\n");
            printf("       e.g.: %s 64 64 64 64 5\n", argv[0]);
            return -1;
        }
    
        unsigned int nx = atoi(argv[1]);
        unsigned int ny = atoi(argv[2]);
        unsigned int nz = atoi(argv[3]);
        unsigned int nw = atoi(argv[4]);
        unsigned int niter = atoi(argv[5]);
    
        float sum = 0.0;
        float span_s = 0.0;
        for (unsigned int i = 0; i < niter; ++i) {
            span_s = run_test_cufft_4d_4x1d(nx, ny, nz, nw);
            if (PRINT_FLAG) printf("[%d]: %.6f s\n", i, span_s);
            sum += span_s;
        }
        printf("%.6f\n", sum/(float)niter);
    
        CHECK_CUDA(cudaDeviceReset());
        return 0;
    }
    # nvcc -o t369_cufft t369.cu -lcufft
    # compute-sanitizer ./t369_cufft 4 4 4 4 1
    ========= COMPUTE-SANITIZER
    Complex data...
      (0.2005, 0.0000i)
      (0.4584, 0.0000i)
      (0.8412, 0.0000i)
      (0.6970, 0.0000i)
      (0.3846, 0.0000i)
    ...
      (0.5214, 0.0000i)
      (0.3179, 0.0000i)
      (0.9771, 0.0000i)
      (0.1417, 0.0000i)
      (0.5867, 0.0000i)
    using 3D and 1D transforms
    Fourier Coefficients...
      (121.0454, 0.0000i)
      (-1.6709, -1.3923i)
      (-12.7056, 0.0000i)
      (-1.6709, 1.3923i)
      (-1.3997, -3.1249i)
    ...
      (1.0800, 0.8837i)
      (2.0585, -2.7097i)
      (1.1019, 1.7167i)
      (4.9727, 0.1244i)
      (-1.2561, 0.6645i)
    [0]: 0.283668 s
    0.283668
    ========= ERROR SUMMARY: 0 errors
    # nvcc -o t369_cufft t369.cu -lcufft -DUSE_1D
    # compute-sanitizer ./t369_cufft 4 4 4 4 1
    ========= COMPUTE-SANITIZER
    Complex data...
      (0.2005, 0.0000i)
      (0.4584, 0.0000i)
      (0.8412, 0.0000i)
      (0.6970, 0.0000i)
      (0.3846, 0.0000i)
    ...
      (0.5214, 0.0000i)
      (0.3179, 0.0000i)
      (0.9771, 0.0000i)
      (0.1417, 0.0000i)
      (0.5867, 0.0000i)
    using 1D transforms
    Fourier Coefficients...
      (121.0454, 0.0000i)
      (-1.6709, -1.3923i)
      (-12.7056, 0.0000i)
      (-1.6709, 1.3923i)
      (-1.3997, -3.1249i)
    ...
      (1.0800, 0.8837i)
      (2.0585, -2.7097i)
      (1.1019, 1.7167i)
      (4.9727, 0.1244i)
      (-1.2561, 0.6645i)
    [0]: 0.285792 s
    0.285792
    ========= ERROR SUMMARY: 0 errors
    #
    

    当我针对 4 4 4 4 1 情况(4x4x4x4 数组)运行 OP 的 FFTW 代码(我没有在这里重复)时,我得到如下输出:

    # ./t369_fftw 4 4 4 4 1
    Complex data...
      (0.2005, 0.0000i)
      (0.4584, 0.0000i)
      (0.8412, 0.0000i)
      (0.6970, 0.0000i)
      (0.3846, 0.0000i)
    ...
      (0.5214, 0.0000i)
      (0.3179, 0.0000i)
      (0.9771, 0.0000i)
      (0.1417, 0.0000i)
      (0.5867, 0.0000i)
    Fourier Coefficients...
      (121.0454, 0.0000i)
      (-1.6709, -1.3923i)
      (-12.7056, 0.0000i)
      (-1.6709, 1.3923i)
      (-1.3997, -3.1249i)
    ...
      (1.0800, 0.8837i)
      (2.0585, -2.7097i)
      (1.1019, 1.7167i)
      (4.9727, 0.1244i)
      (-1.2561, 0.6645i)
    [0]: 0.000018 s
    0.000018
    #
    

    因此,就输出所描绘的内容而言,事情似乎是匹配的。

    笔记:

    • 在一维情况下,我们只需要使用 x 维度的变换。转置操作会将下一个必要的维度带入 x 维度,以便进行下一次变换。

    • 在比较 FFTW 和 CUFFT 时,通常重要的是确保两者使用相同的基本类型。楼主的代码没有反映这一点;我修改了我的演示程序,使其至少在我的系统上与 的cufftDoubleComplex精度 ( double)相匹配。fftw_complex

    • CUDA 12.2 仅进行了少量测试。特别是,转置运算并未经过仔细测试,也未进行任何优化。

    • 2

相关问题

  • CUB reduce_by_key

  • 关于简单非共享任务的 CUDA 性能问题

  • 将“cudaMalloc”和“cudaMemcpy”分开在不同的函数中?

  • 流和 asyncEngine 计数如何在 CUDA 中协同工作

  • 为什么他们只填充一个共享内存?

Sidebar

Stats

  • 问题 205573
  • 回答 270741
  • 最佳答案 135370
  • 用户 68524
  • 热门
  • 回答
  • Marko Smith

    重新格式化数字,在固定位置插入分隔符

    • 6 个回答
  • Marko Smith

    为什么 C++20 概念会导致循环约束错误,而老式的 SFINAE 不会?

    • 2 个回答
  • Marko Smith

    VScode 自动卸载扩展的问题(Material 主题)

    • 2 个回答
  • Marko Smith

    Vue 3:创建时出错“预期标识符但发现‘导入’”[重复]

    • 1 个回答
  • Marko Smith

    具有指定基础类型但没有枚举器的“枚举类”的用途是什么?

    • 1 个回答
  • Marko Smith

    如何修复未手动导入的模块的 MODULE_NOT_FOUND 错误?

    • 6 个回答
  • Marko Smith

    `(表达式,左值) = 右值` 在 C 或 C++ 中是有效的赋值吗?为什么有些编译器会接受/拒绝它?

    • 3 个回答
  • Marko Smith

    在 C++ 中,一个不执行任何操作的空程序需要 204KB 的堆,但在 C 中则不需要

    • 1 个回答
  • Marko Smith

    PowerBI 目前与 BigQuery 不兼容:Simba 驱动程序与 Windows 更新有关

    • 2 个回答
  • Marko Smith

    AdMob:MobileAds.initialize() - 对于某些设备,“java.lang.Integer 无法转换为 java.lang.String”

    • 1 个回答
  • Martin Hope
    Fantastic Mr Fox msvc std::vector 实现中仅不接受可复制类型 2025-04-23 06:40:49 +0800 CST
  • Martin Hope
    Howard Hinnant 使用 chrono 查找下一个工作日 2025-04-21 08:30:25 +0800 CST
  • Martin Hope
    Fedor 构造函数的成员初始化程序可以包含另一个成员的初始化吗? 2025-04-15 01:01:44 +0800 CST
  • Martin Hope
    Petr Filipský 为什么 C++20 概念会导致循环约束错误,而老式的 SFINAE 不会? 2025-03-23 21:39:40 +0800 CST
  • Martin Hope
    Catskul C++20 是否进行了更改,允许从已知绑定数组“type(&)[N]”转换为未知绑定数组“type(&)[]”? 2025-03-04 06:57:53 +0800 CST
  • Martin Hope
    Stefan Pochmann 为什么 {2,3,10} 和 {x,3,10} (x=2) 的顺序不同? 2025-01-13 23:24:07 +0800 CST
  • Martin Hope
    Chad Feller 在 5.2 版中,bash 条件语句中的 [[ .. ]] 中的分号现在是可选的吗? 2024-10-21 05:50:33 +0800 CST
  • Martin Hope
    Wrench 为什么双破折号 (--) 会导致此 MariaDB 子句评估为 true? 2024-05-05 13:37:20 +0800 CST
  • Martin Hope
    Waket Zheng 为什么 `dict(id=1, **{'id': 2})` 有时会引发 `KeyError: 'id'` 而不是 TypeError? 2024-05-04 14:19:19 +0800 CST
  • Martin Hope
    user924 AdMob:MobileAds.initialize() - 对于某些设备,“java.lang.Integer 无法转换为 java.lang.String” 2024-03-20 03:12:31 +0800 CST

热门标签

python javascript c++ c# java typescript sql reactjs html

Explore

  • 主页
  • 问题
    • 最新
    • 热门
  • 标签
  • 帮助

Footer

AskOverflow.Dev

关于我们

  • 关于我们
  • 联系我们

Legal Stuff

  • Privacy Policy

Language

  • Pt
  • Server
  • Unix

© 2023 AskOverflow.DEV All Rights Reserve