本文档为第零届北京大学高性能计算综合能力竞赛讲座内容,演示代码托管在GitHub。
可以在官网页面查询到openmp的历史版本和发布日期
日期 | 版本 |
---|---|
October 1997 | Fortran 1.0 |
October 1998 | C/C++ 1.0 |
March 2002 | C/C++ 2.0 |
May 2005 | OpenMP 2.5 |
May 2008 | OpenMP 3.0 |
July 2011 | OpenMP 3.1 |
July 2013 | OpenMP 4.0 |
November 2015 | OpenMP 4.5 |
November 2018 | OpenMP 5.0 |
November 2020 | OpenMP 5.1 |
November 2021 | OpenMP 5.2 |
包含OpenMP library和OpenMP Runtime library
实际的内存模型更加复杂
proc_bind(master|close|spread):控制线程绑定与否,以及线程在绑定单元(称为 place)上的分布
gcc 的 OpenMP 支持包含在Ubuntu提供的build-essential包中
echo |cpp -fopenmp -dM |grep -i open
# #define _OPENMP 201511
可以直接在编译语句添加-fopenmp,如:
g++ -O2 -std=c++14 -fopenmp hello.cpp -o hello
如果使用cmake构建项目:
# Locate the compiler's OpenMP support. REQUIRED stops configuration with a
# clear message when OpenMP is unavailable, instead of failing later because
# the OpenMP::OpenMP_CXX target used below does not exist.
find_package(OpenMP REQUIRED)
# Have the compiler report #pragma lines it does not process.
add_compile_options(-Wunknown-pragmas)
add_executable(hello src/hello.cpp)
# The imported target supplies the OpenMP compile and link flags.
target_link_libraries(hello OpenMP::OpenMP_CXX)
gcc加入-Wunknown-pragmas会在编译时报告没有处理的#pragma语句
见src/hello.cpp
#include <cstdio>  // declares std::printf, which this program actually uses
#include <iostream>
#include <omp.h>

// Starts a parallel region with 8 threads; each thread prints its own id and
// the size of the team. The order of the output lines is not deterministic.
int main() {
#pragma omp parallel num_threads(8)
    {
        // Both values are declared inside the region, so each thread has its own copy.
        const int tid = omp_get_thread_num();
        const int num_threads = omp_get_num_threads();
        std::printf("Hello from thread %d of %d\n", tid, num_threads);
    }
    return 0;
}
执行结果
Hello from thread 0 of 8
Hello from thread 4 of 8
Hello from thread 5 of 8
Hello from thread 3 of 8
Hello from thread 6 of 8
Hello from thread 1 of 8
Hello from thread 2 of 8
Hello from thread 7 of 8
#pragma omp <directive name> <clause>
用 {} 标记制导语句作用的代码块
设置运行线程数(优先级由低到高):
export OMP_NUM_THREADS=4
void omp_set_num_threads(int)
num_threads(integer-expression)
if 从句判断串行还是并行执行
一些常用库函数:
// 设置并行区运行的线程数
void omp_set_num_threads(int)
// 获得并行区运行的线程数
int omp_get_num_threads(void)
// 获得线程编号
int omp_get_thread_num(void)
// 获得openmp wall clock时间(单位秒)
double omp_get_wtime(void)
// 获得omp_get_wtime时间精度
double omp_get_wtick(void)
支持的从句
if(scalar_expression):决定是否以并行的方式执行并行区
num_threads(integer_expression):指定并行区的线程数
default(shared|none):指定默认变量类型
  shared:默认为共享变量
  none:无默认变量类型,每个变量都需要另外指定
shared(list):指定共享变量列表
private(list):指定私有变量列表
firstprivate(list):同 private,但每个线程的私有副本会以进入并行区前原变量的值进行初始化
数据从句见src/data_clause.cpp
// Data-sharing clause demo: the same 4-iteration increment loop runs in three
// parallel regions that differ only in the clause applied to `cnt`.
// `results` is declared outside this excerpt; each thread stores its final
// value of `cnt` at index `tid`.
int cnt;
// Case 1: no data-sharing clause. `cnt` is declared outside the region, so all
// four threads increment the same variable without synchronization. The sample
// output ("no clause") shows a different value for each thread.
cnt = 1;
#pragma omp parallel num_threads(4)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
results[tid] = cnt;
}
// Case 2: private(cnt). Each thread works on its own copy, and that copy does
// not receive the value 1 assigned here; the sample output labels this case
// "not init" and shows arbitrary values.
cnt = 1;
#pragma omp parallel num_threads(4) private(cnt)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
results[tid] = cnt;
}
// Case 3: firstprivate(cnt). Each thread again has its own copy; the sample
// output shows 5 for every thread, i.e. each copy started at 1 and was
// incremented four times.
cnt = 1;
#pragma omp parallel num_threads(4) firstprivate(cnt)
{
int tid = omp_get_thread_num();
for (int i = 0; i < 4; i++) {
cnt += 1;
}
results[tid] = cnt;
}
执行结果
no clause: 5 9 17 13
private(not init): 4 -187939698 -187939698 -187939698
firstprivate: 5 5 5 5
for构造见src/hello_for.cpp
// Worksharing-loop demo: 8 threads share a loop of `num_threads` iterations,
// and each iteration prints the id of the thread that executes it.
#pragma omp parallel num_threads(8)
{
// Queried once per thread, before the loop is divided among the team.
int tid = omp_get_thread_num();
int num_threads = omp_get_num_threads();
// The `ordered` clause allows an `ordered` region inside the loop body.
#pragma omp for ordered
for (int i = 0; i < num_threads; i++) {
// do something
// Enable at most one of the two pragmas below to synchronize the print.
// The sample output shows all three variants: no synchronization
// (interleaved text), ordered, and critical.
// #pragma omp ordered
// #pragma omp critical
std::cout << "Hello from thread " << tid << std::endl;
}
}
执行结果
#no synchronization
Hello from thread 0Hello from thread
Hello from thread 4
Hello from thread Hello from thread Hello from thread 7
Hello from thread 1
2
Hello from thread 5
6
3
# ordered
Hello from thread 0
Hello from thread 1
Hello from thread 2
Hello from thread 3
Hello from thread 4
Hello from thread 5
Hello from thread 6
Hello from thread 7
# critical
Hello from thread 5
Hello from thread 4
Hello from thread 1
Hello from thread 7
Hello from thread 6
Hello from thread 3
Hello from thread 2
Hello from thread 0
在并行区内对for循环进行线程划分,且for循环满足格式要求:
初始化表达式:需为 var=lb 形式,var 的类型也有限制
判断表达式:需为 var relational-op b 或者 b relational-op var
详细参考OpenMP API 4.5 Specification, p53
常常将parallel和for合并为parallel for制导语句
parallel for支持的从句:
从句 | parallel | for | parallel for |
---|---|---|---|
if | √ |  | √ |
num_threads | √ |  | √ |
default | √ |  | √ |
copyin | √ |  | √ |
private | √ | √ | √ |
firstprivate | √ | √ | √ |
shared | √ |  | √ |
reduction | √ | √ | √ |
lastprivate |  | √ | √ |
schedule |  | √ | √ |
ordered |  | √ | √ |
collapse |  | √ | √ |
nowait |  | √ |  |
lastprivate(list):同 private,但循环结束后会把顺序上最后一次迭代中的值写回原变量
ordered:与 #pragma omp ordered 搭配使用,标记需要按迭代顺序执行的代码
collapse(n):应用于n重循环
nowait:取消代码块结束时的栅栏同步(barrier)
schedule(type [, chunk]):控制调度方式
  static:chunk大小固定(默认n/p)
  dynamic:动态调度,chunk大小固定(默认为1)
  guided:chunk大小动态缩减
  runtime:由系统环境变量OMP_SCHEDULE决定
schedule 示例见src/inner_product.cpp
// Serial reference: for every pair of rows (i, j) with j >= i, compute the dot
// product of row i and row j of the N x N row-major array A, and store it at
// both B[i][j] and B[j][i].
for (int i = 0; i < N; i++) {
for (int j = i; j < N; j++) {
double sum = 0;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * A[j * N + k];
}
B[i * N + j] = sum;
B[j * N + i] = sum;
}
}
// Parallel version of the same computation: the outer loop over i is divided
// among threads, and the schedule is selected at run time (the sample runs set
// OMP_SCHEDULE to "dynamic" and "static").
// Because j starts at i, outer iteration i performs N - i row products, so the
// amount of work per outer iteration is uneven.
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < N; i++) {
for (int j = i; j < N; j++) {
// Declared inside the loop body, so every iteration has its own accumulator.
double sum = 0;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * A[j * N + k];
}
B[i * N + j] = sum;
B[j * N + i] = sum;
}
}
执行结果
# export OMP_SCHEDULE="dynamic"
size: 1024
sequence time: 1.46233
omp time: 0.133192
# export OMP_SCHEDULE="static"
size: 1024
sequence time: 1.47874
omp time: 0.219114
见src/vector_norm.cpp
// Three ways to compute sqrt(sum of b[i]^2). The accumulators ans_seq, ans_omp
// and ans_omp_sync are declared outside this excerpt (presumably initialized
// to 0 -- confirm in src/vector_norm.cpp).

// Variant 1: serial reference.
for (int i = 0; i < N; i++) {
ans_seq += b[i] * b[i];
}
ans_seq = sqrt(ans_seq);
// Variant 2: parallel loop with a `+` reduction on ans_omp.
#pragma omp parallel for reduction(+ : ans_omp)
for (int i = 0; i < N; i++) {
ans_omp += b[i] * b[i];
}
ans_omp = sqrt(ans_omp);
// Variant 3: parallel loop in which every update of the shared accumulator is
// synchronized individually. Swap `atomic` for the commented-out `critical` to
// compare the two; the sample timings show both are far slower than the
// reduction variant.
#pragma omp parallel for
for (int i = 0; i < N; i++) {
#pragma omp atomic
// #pragma omp critical
ans_omp_sync += b[i] * b[i];
}
ans_omp_sync = sqrt(ans_omp_sync);
执行结果
# atomic
size: 33554432
sequence result: 3344.05
omp result: 3344.05
omp sync result: 3344.05
sequence time: 0.0928805
omp time: 0.0180116
omp sync time: 5.17156
# critical
size: 33554432
sequence result: 3344.2
omp result: 3344.2
omp sync result: 3344.2
sequence time: 0.0929021
omp time: 0.0179938
omp sync time: 7.74378
reduction 的执行过程:每个线程持有一份私有副本 omp_priv
并行区结束时,各线程的 omp_priv 和 omp_in 一起顺序进行reduction,写回原变量
#pragma omp sections
{
#pragma omp section
code1();
#pragma omp section
code2();
}
#pragma omp barrier
:在特定位置进行栅栏同步
#pragma omp single
:某段代码单线程执行,带隐式同步(使用nowait去掉)
#pragma omp master
:采用主线程执行,无隐式同步
#pragma omp critical
:某段代码线程互斥执行
#pragma omp atomic
:单个特定格式的语句或语句组中某个变量进行原子操作
见src/matrix_vector.cpp
// Three versions of the matrix-vector product x = A * b, where A is an N x N
// row-major array. The output vectors are declared outside this excerpt;
// x_seq and x_omp_fs are accumulated into directly, so they are presumably
// zero-initialized beforehand -- confirm in src/matrix_vector.cpp.

// Version 1: serial reference.
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x_seq[i] += A[i * N + j] * b[j];
}
}
// Version 2: rows are divided among threads; each row is accumulated in a
// local variable and written to x_omp[i] once ("simple omp time" in the
// sample output).
#pragma omp parallel for
for (int i = 0; i < N; i++) {
double tmp = 0;
for (int j = 0; j < N; j++) {
tmp += A[i * N + j] * b[j];
}
x_omp[i] = tmp;
}
// Version 3: same row split, but every inner iteration updates x_omp_fs[i] in
// the shared output array. The sample output reports this variant as
// "false sharing time", and it is slower than version 2 in every sample run.
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
x_omp_fs[i] += A[i * N + j] * b[j];
}
}
执行结果
size: 16384
sequence time: 1.07896
simple omp time: 0.0938018
false sharing time: 0.110479
size: 16384
sequence time: 1.38333
simple omp time: 0.0958252
false sharing time: 0.115473
size: 16384
sequence time: 1.0359
simple omp time: 0.0973124
false sharing time: 0.129693
aligned:用于列出内存对齐的指针
safelen:用于标记循环展开时的数据依赖
编译器也自带向量化功能,例如gcc:
-O3
-ffast-math
-fivopts
-march=native
-fopt-info-vec
-fopt-info-vec-missed