背景:我一直在编写一个大量使用原子操作的多线程程序。我注意到这些原子操作非常慢,尤其是在 ARM 上,因为编译器插入了太多的内存屏障(fence),有时甚至是在循环内部。所以我想通过显式指定内存顺序(memory order)来消除不必要的屏障。
我遇到了下面这种情况,但不确定在这里使用宽松加载(relaxed load)是否安全。以这个简单的参数读取示例为例:
// Argument block passed by pointer to WorkerThreadFunction.
// Field order matters: main() initializes this struct positionally.
typedef struct {
// Payload that the worker copies into a local variable when it starts.
big_struct Data;
// Set to true (release store) by the worker once it has copied Data;
// the creating thread polls this flag.
_Atomic bool bDataReadDone;
} worker_thread_parameter;
/*
 * Worker thread entry point (thrd_start_t signature).
 *
 * Parameter points to a worker_thread_parameter owned by the creating
 * thread. The function copies Data into a local variable, then publishes
 * bDataReadDone so the creator knows the copy is complete.
 *
 * Returns 0 as the thread's result code.
 */
static int WorkerThreadFunction(void* Parameter) {
    // Read Data
    worker_thread_parameter* pWorkerParameter = Parameter;
    big_struct Data = pWorkerParameter->Data;

    // Notify that reading Data is done.
    // Use release store to ensure Data is read before this.
    atomic_store_explicit(&pWorkerParameter->bDataReadDone, true, memory_order_release);

    // Do something with Data
    // From here on only the local copy may be used: the creator waits on the
    // flag above and then lets the parameter object go out of scope.

    // A thread start function must return a value; falling off the end of a
    // non-void function yields an indeterminate thread result.
    return 0;
}
int main() {
thrd_t aWorkerThread[8];
for (size_t i = 0; i < 8; ++i) {
worker_thread_parameter WorkerParameter = { /* Data = something */, false };
thrd_create(&aWorkerThread[i], WorkerThreadFunction, &WorkerParameter);
// Wait for Data to be read
// Use relaxed load because this thread doesn't read Data anymore,
// so we don't need to synchronize with the flag.
while (!atomic_load_explicit(&WorkerParameter.bDataReadDone, memory_order_relaxed));
}
}
或者这个例子:
// Shared state, set up before any of the threads are started.
// bUsingData stays true while WorkerThread is still using *pData.
_Atomic bool bUsingData = true;
// Buffer used by WorkerThread and released by CleanerThread.
big_struct* pData = malloc(sizeof *pData);
/*
 * Worker thread body: uses the shared pData buffer, then clears bUsingData
 * to tell the cleaner thread that the buffer is no longer in use.
 *
 * Returns 0.
 */
static int WorkerThread() {
    Use(pData);

    // Notify the cleaner thread to free Data.
    // Use release store to ensure Data is used before this.
    atomic_store_explicit(&bUsingData, false, memory_order_release);

    // The function is declared to return int, so return a value explicitly
    // instead of falling off the end.
    return 0;
}
/*
 * Cleaner thread body: spins until WorkerThread clears bUsingData, then
 * frees the shared pData buffer.
 *
 * Returns 0.
 */
static int CleanerThread() {
    // Use relaxed load because this thread doesn't read Data anymore,
    // so we don't need to synchronize with the flag.
    // NOTE(review): this relaxed load is the ordering being asked about and
    // is intentionally left as written. A relaxed load does not
    // synchronize-with the worker's release store, so Use(pData) is not
    // guaranteed to happen-before free(pData); memory_order_acquire would
    // pair with the release store and provide that guarantee.
    //
    // atomic_load_explicit takes a pointer to the atomic object, so the flag
    // is passed by address.
    while (atomic_load_explicit(&bUsingData, memory_order_relaxed)) {
        // Busy-wait until the worker reports it is done with pData.
    }

    free(pData);
    return 0;
}
还有这个例子:
// Shared atomic variables, both initially zero.
_Atomic int X = 0;
_Atomic int Y = 0;
// Thread 1: relaxed store to X, followed by a release store to Y.
atomic_store_explicit(&X, 99, memory_order_relaxed);
atomic_store_explicit(&Y, 1, memory_order_release);
// Thread 2: relaxed load of Y; if it is non-zero, do a relaxed store to X
// and then print the result of a relaxed load of X from this same thread.
if (atomic_load_explicit(&Y, memory_order_relaxed)) {
atomic_store_explicit(&X, 100, memory_order_relaxed);
printf("%i", atomic_load_explicit(&X, memory_order_relaxed));
}
// Question: does thread 2 always print 100?