smilingbuddha提出的问题 -coding

smilingbuddha

Asked: 2025-03-09 10:04:21 +0800 CST

在 C 语言中使用 OpenMP 的包含式扫描（inclusive scan）语法

我想使用 OpenMP 中的包含式扫描（inclusive scan）操作来实现一个算法。下面描述了我的尝试，但最终只获得了很有限的加速。

包含式扫描的定义如下：对于输入向量 [x1,x2,x3,x4]，它输出部分和的序列 [x1, x1+x2, x1+x2+x3, x1+x2+x3+x4]。这是一个高度可并行化的操作，从表面上看，OpenMP 已经很好地实现了它。

我查看了以下参考资料：https://theartofhpc.com/pcse/omp-reduction.html#Scanprefixoperations （手册参考https://www.openmp.org/spec-html/5.0/openmpsu45.html#x68-1940002.9.6现在对我来说似乎太神秘了）

artofhpc 网站说，reduction 子句得到了一个修饰语inscan：

#pragma omp parallel for reduction(inscan,+:sumvar)

在并行循环的主体中，有一个扫描指令，允许您存储部分结果。对于包含性扫描，reduction 变量在扫描指令之前更新：

  sumvar // update
#pragma omp scan inclusive(sumvar)
  partials[i] = sumvar

我尝试遵循相同的语法，以测量与标准串行缩减相比的性能，结果令人非常失望。我的代码位于文章底部。

在代码中，我只做了一个简单的测试：构造一个包含 9000 万个元素的超大向量，其元素是区间 [-1,1] 中的随机值，然后使用越来越多的线程对其进行扫描并测量加速比。这是我的结果（重复运行得到的结果相同）。我的笔记本电脑 CPU 有 16 个硬件线程，但总体加速比令人失望，只有 1.36。（我本来期望会有更大的加速比！）

我使用 OpenMP 语法进行缩减的方式有问题吗？

➜  Desktop gcc -fopenmp scantest.c  && ./a.out

NumThreads  Speedup
1        0.458
2        1.173
3        1.424
4        1.686
5        1.635
6        1.501
7        1.522
8        1.499
9        1.455
10       1.416
11       1.395
12       1.393
13       1.352
14       1.336
15       1.353
16       1.357

#include<stdio.h>
#include<omp.h>
#include<math.h>
#include<stdlib.h>
#include<assert.h>

/*
 * Benchmark: serial inclusive scan versus the OpenMP `reduction(inscan, +)` scan.
 *
 * Fills a vector of N doubles with pseudo-random values in [-1, 1], computes the
 * running sums once serially, then repeats the computation with the OpenMP scan
 * directive for 1..tmax threads, printing serial_time / parallel_time for each.
 *
 * argv[1] (optional): largest thread count to test; defaults to 35.
 *
 * Returns 0 on success, EXIT_FAILURE if the work vectors cannot be allocated.
 */
int main(int argc, char** argv){

  int N = 9e7;     // vector size
  double* x;                   // vector to reduce
  double* partials_s;          // serial scan output
  double* partials_p;          // parallel scan output

  double end, start;           // timing variables
  double sumvar;

  int tmax = argc>1? atoi(argv[1]):35;
  int threadcount ;

  // Allocate space for all vectors
  x           = malloc(sizeof *x * N);
  partials_s  = malloc(sizeof *partials_s * N);
  partials_p  = malloc(sizeof *partials_p * N);

  // Three vectors of N doubles are requested; bail out cleanly instead of
  // dereferencing NULL if any of the requests was refused.
  if (x == NULL || partials_s == NULL || partials_p == NULL) {
    fprintf(stderr, "Failed to allocate three vectors of %d doubles\n", N);
    free(x);
    free(partials_s);
    free(partials_p);
    return EXIT_FAILURE;
  }

  // Populate the input vectors
  for(int i=0 ; i<N ; ++i){
    x[i] = -1+2.0*rand()/(double)RAND_MAX;
    partials_s[i] = 0.0;
    partials_p[i] = 0.0;
  }

  //-----------------------------------------
  // Serial inclusive scan
  //-----------------------------------------
  start = omp_get_wtime();
  sumvar = 0.0;
  for(int i=0 ; i<N ; ++i){
      sumvar += x[i];
      partials_s[i] = sumvar;
  }
  end = omp_get_wtime();
  double stime = end-start; // Time to run the serial code


  //-----------------------------------------------------------------------------
  // Performance of parallel inclusive scan. Print ratio of serial/parallel time
  //----------------------------------------------------------------------------
  printf("\nNumThreads  Speedup \n");
  for(threadcount=1;threadcount<=tmax;++threadcount){

    start = omp_get_wtime();

    sumvar = 0.0;
    #pragma omp parallel for num_threads(threadcount) reduction(inscan,+:sumvar)
    for(int i=0 ; i<N ; ++i){

      // Input phase: statements before the scan directive update sumvar.
      sumvar = sumvar + x[i];
      #pragma omp scan inclusive(sumvar)
      // Scan phase: statements after the directive read the inclusive prefix.
      partials_p[i] = sumvar;
    }
    end = omp_get_wtime();
    double ptime = end-start;

    printf("%d \t %.3f\n",threadcount,stime/ptime);

  }

  //for(int i=0 ; i<N ; ++i){
  //  printf("%.4f  %.4f\n", partials_s[i], partials_p[i]);
  //}

  // Deallocate
  free(x);
  free(partials_s);
  free(partials_p);

  return 0;
}

smilingbuddha

Asked: 2025-03-07 00:14:23 +0800 CST

了解两个函数对整数数组的每个元素进行递增的时间差异

在下面粘贴的代码中，我运行了两个函数 f1 和 f2，它们执行完全相同的工作：取一个数字 T 和一个整数数组 arr（已初始化为 0），然后将 arr 的每个元素递增 T 次。因此，在 f1 和 f2 运行结束时，输入数组 0,0...0 都应该变成 T,T...T。

我不明白的是，当 T 为 10 亿时，为什么 f1 的运行速度比 f2 慢得多（大约慢 1.76 倍）。这是我的输出

➜  Desktop gcc timing-difference.c && ./a.out 1000000000 16

-----> 运行 f1 <------- 耗时：15.511 秒

-----> 运行 f2 <------- 耗时：8.887 秒%

这里 T 通过 argv[1] 传给程序，数组长度通过 argv[2] 传入。timing-difference.c 文件粘贴在这篇文章的末尾。

基本上，f1是直接增加每个arr[i]，而f2使用临时变量tmp进行增加，然后在完成后将其分配给 arr[i]。

#include<stdio.h>
#include<unistd.h>
#include<stdlib.h>
#include<time.h>

/*
 * Increment every element of arr by 1, T times, writing to arr[i] directly on
 * each step, then print the processor time the loops consumed.
 *
 * T         - number of unit increments applied to each element
 * arr       - array updated in place
 * arrlength - number of elements of arr to process
 *
 * Prints a banner before the loops and the elapsed time (in seconds) after them.
 */
void f1(int T, int* arr, int arrlength){

  printf("\n-----> Running f1 <-------\n");
  clock_t end, start;
  double cpu_time_used;

  // For each element of arr, increment it T times.
  // The read-modify-write targets arr[i] itself on every inner iteration.
  start = clock();
  for(int i = 0 ;  i<arrlength ;++i){
    for (int j=0 ; j<T ; ++j){
      arr[i] += 1;
    }
  }
  end = clock();
  // Convert clock ticks to seconds.
  cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; 

  printf("Time taken: %.3f seconds", cpu_time_used);

}

/*
 * Increment every element of arr by 1, T times, accumulating in a local
 * temporary and storing the result back to arr[i] once per element, then print
 * the processor time the loops consumed.
 *
 * T         - number of unit increments applied to each element
 * arr       - array updated in place
 * arrlength - number of elements of arr to process
 *
 * Prints a banner before the loops and the elapsed time (in seconds) after them.
 */
void f2(int T, int* arr, int arrlength){

  printf("\n-----> Running f2 <-------\n");
  clock_t end, start;
  double cpu_time_used;

  // For each element of arr, increment it T times.
  start = clock();
  for(int i = 0 ;  i<arrlength ;++i){

    // Work on a local copy; arr[i] is read once and written once per element.
    int tmp=arr[i];
    for (int j=0 ; j<T ; ++j){
      tmp += 1;
    }
    arr[i] = tmp; 
  }

  end = clock();
  // Convert clock ticks to seconds.
  cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; 
  printf("Time taken: %.3f seconds", cpu_time_used);

}

/*
 * Write the first arrlength elements of arr to stdout on a single line, each
 * followed by two spaces, then emit two newlines.
 */
void print_arr(int* arr, int arrlength){
  int idx = 0;

  while (idx < arrlength) {
    printf("%d  ", arr[idx]);
    idx++;
  }

  printf("\n\n");
}

/*
 * Set the first arrlength elements of arr to zero.
 * Does nothing when arrlength is zero or negative.
 */
void initialize_array(int* arr, int arrlength){
  int *cursor = arr;
  int remaining = arrlength;

  while (remaining > 0) {
    *cursor = 0;
    ++cursor;
    --remaining;
  }
}

/*
 * Run f1 and then f2 on the same zero-initialised array so their timings can
 * be compared.
 *
 * Usage: ./a.out T arrlength
 *   argv[1] - T, the number of increments applied to every element
 *   argv[2] - arrlength, the number of array elements (must be positive)
 *
 * Returns 0 on success, EXIT_FAILURE on missing/invalid arguments or when the
 * array cannot be allocated.
 */
int main(int argc, char** argv){

  // Both arguments are required; without this check atoi would receive NULL.
  if (argc < 3) {
    fprintf(stderr, "usage: %s T arrlength\n", argc > 0 ? argv[0] : "a.out");
    return EXIT_FAILURE;
  }

  int T         = atoi(argv[1]);
  int arrlength = atoi(argv[2]);

  if (arrlength <= 0) {
    fprintf(stderr, "arrlength must be a positive integer\n");
    return EXIT_FAILURE;
  }

  // Heap allocation instead of a variable-length array: the length comes from
  // the command line and must not be able to overflow the stack.
  int* arr = malloc(sizeof *arr * (size_t)arrlength);
  if (arr == NULL) {
    fprintf(stderr, "Failed to allocate %d ints\n", arrlength);
    return EXIT_FAILURE;
  }

  initialize_array(arr, arrlength);
  f1(T,arr,arrlength);  // --> why does this run slower than f2?
  //print_arr(arr, arrlength);

  printf("\n\n");

  initialize_array(arr, arrlength);
  f2(T,arr,arrlength);
  //print_arr(arr, arrlength);

  free(arr);
  return 0;
}

编辑：即使我先调用 f2、后调用 f1，或者多次运行程序，得到的计时结果也是一样的。

启用 -O2 后，两者的计时均为 0.000。我很好奇 gcc 编译时使用的默认设置是什么，以及为什么性能差异如此之大。正如 wohlstad 所建议的那样，答案当然要到汇编中去找，但不幸的是，我读 x86 汇编的水平还不足以让我完全弄明白 :-(

在 C 语言中使用 OpenMP 中的包容性扫描语法

了解两个函数对整数数组的每个元素进行递增的时间差异

为什么 C++20 概念会导致循环约束错误，而老式的 SFINAE 不会？

VScode 自动卸载扩展的问题（Material 主题）

Vue 3：创建时出错“预期标识符但发现‘导入’”[重复]

具有指定基础类型但没有枚举器的“枚举类”的用途是什么？

如何修复未手动导入的模块的 MODULE_NOT_FOUND 错误？

`(表达式，左值) = 右值` 在 C 或 C++ 中是有效的赋值吗？为什么有些编译器会接受/拒绝它？

何时应使用 std::inplace_vector 而不是 std::vector？

在 C++ 中，一个不执行任何操作的空程序需要 204KB 的堆，但在 C 中则不需要

PowerBI 目前与 BigQuery 不兼容：Simba 驱动程序与 Windows 更新有关

AdMob：MobileAds.initialize() - 对于某些设备，“java.lang.Integer 无法转换为 java.lang.String”

smilingbuddha's questions