本文档为第零届北京大学高性能计算综合能力竞赛讲座内容,演示代码托管在GitHub。
可以在OpenMP官网页面查询到OpenMP的历史版本和发布日期:
| 日期 | 版本 |
|---|---|
| October 1997 | Fortran 1.0 |
| October 1998 | C/C++ 1.0 |
| March 2002 | C/C++ 2.0 |
| May 2005 | OpenMP 2.5 |
| May 2008 | OpenMP 3.0 |
| July 2011 | OpenMP 3.1 |
| July 2013 | OpenMP 4.0 |
| November 2015 | OpenMP 4.5 |
| November 2018 | OpenMP 5.0 |
| November 2020 | OpenMP 5.1 |
| November 2021 | OpenMP 5.2 |

包含OpenMP library和OpenMP Runtime library



实际的内存模型更加复杂


proc_bind(master|close|spread):控制线程绑定与否,以及线程对于绑定单元(称为 place)的分布

gcc对OpenMP的支持包含在Ubuntu提供的build-essential包中
echo |cpp -fopenmp -dM |grep -i open
# #define _OPENMP 201511
可以直接在编译语句添加-fopenmp,如:
g++ -O2 -std=c++14 -fopenmp hello.cpp -o hello
如果使用cmake构建项目:
# REQUIRED makes configuration stop with a clear message when no OpenMP support
# is found; the imported target OpenMP::OpenMP_CXX linked below only exists
# when the package was found.
find_package(OpenMP REQUIRED)
# Ask the compiler to report #pragma lines it does not handle.
add_compile_options(-Wunknown-pragmas)
add_executable(hello src/hello.cpp)
# Linking the imported target supplies the OpenMP flags and libraries for hello.
target_link_libraries(hello OpenMP::OpenMP_CXX)
gcc加入-Wunknown-pragmas会在编译时报告没有处理的#pragma语句
见src/hello.cpp
#include <cstdio>  // std::printf is used below; do not rely on <iostream> pulling it in
#include <iostream>
#include <omp.h>

// Starts a parallel region with 8 threads; every thread prints its own
// thread number together with the size of the team.
int main() {
#pragma omp parallel num_threads(8)
    {
        // Both values are queried inside the region, so each thread holds its own copy.
        const int tid = omp_get_thread_num();
        const int num_threads = omp_get_num_threads();
        // One printf call per thread produces one complete greeting line.
        std::printf("Hello from thread %d of %d\n", tid, num_threads);
    }
    return 0;
}
执行结果
Hello from thread 0 of 8
Hello from thread 4 of 8
Hello from thread 5 of 8
Hello from thread 3 of 8
Hello from thread 6 of 8
Hello from thread 1 of 8
Hello from thread 2 of 8
Hello from thread 7 of 8
#pragma omp <directive name> <clause>
{}标记作用的代码块
设置运行线程数(优先级由低到高):
- export OMP_NUM_THREADS=4
- void omp_set_num_threads(int)
- num_threads(integer-expression)

if从句判断串行还是并行执行

一些常用库函数:
// Set the number of threads that run a parallel region
void omp_set_num_threads(int)
// Get the number of threads running the parallel region
int omp_get_num_threads(void)
// Get the thread number
int omp_get_thread_num(void)
// Get the OpenMP wall clock time (in seconds)
double omp_get_wtime(void)
// Get the timer precision of omp_get_wtime
double omp_get_wtick(void)
支持的从句
if(scalar_expression):决定是否以并行的方式执行并行区
num_threads(integer_expression):指定并行区的线程数
default(shared|none):指定默认变量类型
- shared:默认为共享变量
- none:无默认变量类型,每个变量都需要另外指定

shared(list):指定共享变量列表
private(list):指定私有变量列表
firstprivate(list):同private,但每个线程的私有副本会用进入并行区前原变量的值初始化
数据从句见src/data_clause.cpp
// Counter used by all three experiments; it is reset to 1 before each one.
int cnt;
cnt = 1;
// Experiment 1 - no data-sharing clause: cnt is declared outside the region,
// so the four threads all update the same variable without synchronization.
// The recorded values therefore depend on thread timing.
#pragma omp parallel num_threads(4)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
// Record the value of cnt this thread sees after its own four increments.
results[tid] = cnt;
}
cnt = 1;
// Experiment 2 - private(cnt): each thread increments its own copy of cnt.
// That copy is not initialized from the outer cnt, so the recorded values
// are indeterminate.
#pragma omp parallel num_threads(4) private(cnt)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
results[tid] = cnt;
}
cnt = 1;
// Experiment 3 - firstprivate(cnt): each thread's own copy starts from the
// outer value (1), so every thread records 1 + 4 = 5.
#pragma omp parallel num_threads(4) firstprivate(cnt)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
results[tid] = cnt;
}
执行结果
no clause: 5 9 17 13
private(not init): 4 -187939698 -187939698 -187939698
firstprivate: 5 5 5 5
for构造见src/hello_for.cpp
// Parallel region with 8 threads; tid and num_threads are declared inside the
// region, so every thread has its own copies.
#pragma omp parallel num_threads(8)
{
int tid = omp_get_thread_num();
int num_threads = omp_get_num_threads();
// Distribute the loop iterations over the threads of the team. The ordered
// clause is what permits an ordered region inside the loop body.
#pragma omp for ordered
for (int i = 0; i < num_threads; i++) {
// do something
// Enable one of the two pragmas below to synchronize the print statement;
// with both left commented out the output of different threads can interleave.
// #pragma omp ordered
// #pragma omp critical
std::cout << "Hello from thread " << tid << std::endl;
}
}
执行结果
#no synchronization
Hello from thread 0Hello from thread
Hello from thread 4
Hello from thread Hello from thread Hello from thread 7
Hello from thread 1
2
Hello from thread 5
6
3
# ordered
Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7
# critical
Hello from thread 5
Hello from thread 4
Hello from thread 1
Hello from thread 7
Hello from thread 6
Hello from thread 3
Hello from thread 2
Hello from thread 0
在并行区内对for循环进行线程划分,且for循环满足格式要求:
- init-expr:var=lb形式,类型也有限制
- test-expr:var relational-op b或者b relational-op var
- 详细参考OpenMP API 4.5 Specification, p53
常常将parallel和for合并为parallel for制导语句
parallel for支持的从句:
| | parallel | for | parallel for |
|---|---|---|---|
| if | √ | | √ |
| num_threads | √ | | √ |
| default | √ | | √ |
| copyin | √ | | √ |
| private | √ | √ | √ |
| firstprivate | √ | √ | √ |
| shared | √ | | √ |
| reduction | √ | √ | √ |
| lastprivate | | √ | √ |
| schedule | | √ | √ |
| ordered | | √ | √ |
| collapse | | √ | √ |
| nowait | | √ | |
lastprivate(list):同private,但会把顺序上最后一次循环迭代中的值写回原变量
ordered:与#pragma omp ordered搭配使用,标记需要顺序执行的代码
collapse(n):应用于n重循环
nowait:取消代码块结束时的栅栏同步(barrier)
schedule(type [, chunk]):控制调度方式
- static:chunk大小固定(默认n/p)
- dynamic:动态调度,chunk大小固定(默认为1)
- guided:chunk大小动态缩减
- runtime:由系统环境变量OMP_SCHEDULE决定

schedule见src/inner_product.cpp
// Sequential reference: for every pair of rows (i, j) with j >= i, compute the
// dot product of row i and row j of A (indexed as A[row * N + col]) and store
// it in both B[i][j] and B[j][i].
for (int i = 0; i < N; i++) {
for (int j = i; j < N; j++) {
double sum = 0;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * A[j * N + k];
}
B[i * N + j] = sum;
B[j * N + i] = sum;
}
}
// Parallel version of the same computation: the outer i loop is split across
// threads. Because j starts at i, an iteration with a small i performs more
// inner work than one with a large i, so the schedule kind matters;
// schedule(runtime) defers that choice to the OMP_SCHEDULE environment variable.
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < N; i++) {
for (int j = i; j < N; j++) {
// sum is declared inside the loop body, so each iteration has its own accumulator.
double sum = 0;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * A[j * N + k];
}
B[i * N + j] = sum;
B[j * N + i] = sum;
}
}
执行结果
# export OMP_SCHEDULE="dynamic"
size: 1024
sequence time: 1.46233
omp time: 0.133192
# export OMP_SCHEDULE="static"
size: 1024
sequence time: 1.47874
omp time: 0.219114
见src/vector_norm.cpp
// Sequential reference: sum of squares of b followed by a square root.
for (int i = 0; i < N; i++) {
ans_seq += b[i] * b[i];
}
ans_seq = sqrt(ans_seq);
// Reduction version: the reduction(+ : ans_omp) clause lets each thread
// accumulate its share of the sum, and the partial sums are added into ans_omp.
#pragma omp parallel for reduction(+ : ans_omp)
for (int i = 0; i < N; i++) {
ans_omp += b[i] * b[i];
}
ans_omp = sqrt(ans_omp);
// Synchronized version: every single update of the shared ans_omp_sync is
// protected, either by atomic (active) or by critical (commented alternative).
#pragma omp parallel for
for (int i = 0; i < N; i++) {
#pragma omp atomic
// #pragma omp critical
ans_omp_sync += b[i] * b[i];
}
ans_omp_sync = sqrt(ans_omp_sync);
执行结果
# atomic
size: 33554432
sequence result: 3344.05
omp result: 3344.05
omp sync result: 3344.05
sequence time: 0.0928805
omp time: 0.0180116
omp sync time: 5.17156
# critical
size: 33554432
sequence result: 3344.2
omp result: 3344.2
omp sync result: 3344.2
sequence time: 0.0929021
omp time: 0.0179938
omp sync time: 7.74378


每个线程创建私有副本omp_priv,并按归约运算符初始化
omp_priv和omp_in一起顺序进行reduction,写回原变量

#pragma omp sections
{
// First section: code1() stands for one independent piece of work.
#pragma omp section
code1();
// Second section: code2() stands for another independent piece of work.
#pragma omp section
code2();
}
#pragma omp barrier:在特定位置进行栅栏同步

#pragma omp single:某段代码单线程执行,带隐式同步(使用nowait去掉)

#pragma omp master:采用主线程执行,无隐式同步

#pragma omp critical:某段代码线程互斥执行

#pragma omp atomic:单个特定格式的语句或语句组中某个变量进行原子操作
见src/matrix_vector.cpp
// Sequential reference: x_seq[i] accumulates the dot product of row i of A
// (indexed as A[i * N + j]) with the vector b.
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x_seq[i] += A[i * N + j] * b[j];
}
}
// Parallel version with a local accumulator: each iteration sums into tmp and
// writes the output element x_omp[i] exactly once.
#pragma omp parallel for
for (int i = 0; i < N; i++) {
double tmp = 0;
for (int j = 0; j < N; j++) {
tmp += A[i * N + j] * b[j];
}
x_omp[i] = tmp;
}
// Parallel version that accumulates directly into x_omp_fs[i] on every inner
// iteration; this is the variant reported as "false sharing" in the timings.
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x_omp_fs[i] += A[i * N + j] * b[j];
}
}
执行结果
size: 16384
sequence time: 1.07896
simple omp time: 0.0938018
false sharing time: 0.110479
size: 16384
sequence time: 1.38333
simple omp time: 0.0958252
false sharing time: 0.115473
size: 16384
sequence time: 1.0359
simple omp time: 0.0973124
false sharing time: 0.129693



aligned用于列出内存对齐的指针
safelen用于标记循环展开时的数据依赖
编译器也自带向量化功能,例如gcc:
-O3 -ffast-math -fivopts -march=native -fopt-info-vec -fopt-info-vec-missed