Contexto: estou escrevendo um programa multithread que usa operações atômicas (atomics) extensivamente. Percebi que elas são muito lentas, especialmente em ARM, porque o compilador insere muitas barreiras de memória (fences), às vezes até dentro de loops. Por isso, quero eliminar as barreiras desnecessárias usando ordens de memória (memory orders) mais fracas.
Deparei-me com o caso abaixo e não tenho certeza se é seguro usar uma leitura relaxada (relaxed load). Veja este exemplo simples de leitura de parâmetros:
/*
 * Hand-off block shared between the thread that creates a worker and the
 * worker itself.
 */
typedef struct {
    // Input payload; the worker copies it into a local at start-up.
    big_struct Data;
    // Starts out false; the worker stores true once its copy of Data is taken,
    // and the creating thread polls it.
    atomic_bool bDataReadDone;
} worker_thread_parameter;
/*
 * Thread entry point: takes a private copy of the input held in the shared
 * parameter block, then tells the creating thread the block is no longer
 * needed.
 *
 * Parameter points to a worker_thread_parameter owned by the creating thread.
 * It must not be dereferenced after bDataReadDone has been set, because the
 * owner is then free to reuse or destroy it.
 *
 * Returns 0 as the thread's result code.
 */
static int WorkerThreadFunction(void* Parameter) {
    worker_thread_parameter* pWorkerParameter = Parameter;

    // Copy the input so the rest of the function works on thread-local data.
    big_struct Data = pWorkerParameter->Data;

    // Release store: orders the copy above before the flag becomes visible.
    // This only helps if the waiting thread pairs it with an acquire
    // operation (acquire load, or relaxed load followed by an acquire fence).
    atomic_store_explicit(&pWorkerParameter->bDataReadDone, true, memory_order_release);

    // Do something with Data

    // A thread start function returns int; falling off the end would hand the
    // runtime an indeterminate result.
    return 0;
}
int main() {
thrd_t aWorkerThread[8];
for (size_t i = 0; i < 8; ++i) {
worker_thread_parameter WorkerParameter = { /* Data = something */, false };
thrd_create(&aWorkerThread[i], WorkerThreadFunction, &WorkerParameter);
// Wait for Data to be read
// Use relaxed load because this thread doesn't read Data anymore,
// so we don't need to synchronize with the flag.
while (!atomic_load_explicit(&WorkerParameter.bDataReadDone, memory_order_relaxed));
}
}
Ou este exemplo:
// Shared state for the worker/cleaner pair; both objects are set up before
// either thread is started.
// NOTE(review): a malloc() call is not a constant expression, so this
// initializer is only valid inside a function -- presumably shorthand for
// setup code that runs before the threads are created; confirm. The result of
// malloc() is also not checked here.
atomic_bool bUsingData = true;
big_struct *pData = malloc(sizeof *pData);
/*
 * Worker side of the hand-off: uses the shared buffer, then clears
 * bUsingData so the buffer may be released.
 *
 * Returns 0 as the thread's result code.
 */
static int WorkerThread() {
    Use(pData);

    // Notify the cleaner thread to free Data.
    // Release store: every access made by Use() is ordered before the flag
    // becomes visible. The thread that frees the buffer must pair this with
    // an acquire operation.
    atomic_store_explicit(&bUsingData, false, memory_order_release);

    // The function is declared to return int; do not fall off the end.
    return 0;
}
/*
 * Cleaner side of the hand-off: waits until the worker has cleared
 * bUsingData, then frees the shared buffer.
 *
 * Returns 0 as the thread's result code.
 */
static int CleanerThread() {
    // Poll with relaxed loads so no barrier is executed inside the loop.
    // atomic_load_explicit takes a pointer to the atomic object.
    while (atomic_load_explicit(&bUsingData, memory_order_relaxed)) {
    }

    // One acquire fence after the loop pairs with the worker's release store.
    // It is required even though this thread never reads the buffer: free()
    // ends the buffer's lifetime, and without a happens-before edge that
    // would race with the worker's accesses in Use().
    atomic_thread_fence(memory_order_acquire);

    free(pData);
    return 0;
}
E este exemplo:
// Litmus test: two threads store to X; Y is the flag that orders them.
_Atomic int X = 0;
_Atomic int Y = 0;
// Thread 1
atomic_store_explicit(&X, 99, memory_order_relaxed);
// Release store: makes the X = 99 store visible-before to any thread that
// acquires Y == 1.
atomic_store_explicit(&Y, 1, memory_order_release);
// Thread 2
// Acquire load: when it reads 1 it synchronizes with thread 1's release
// store, so X = 99 happens-before X = 100 and therefore precedes it in X's
// modification order.
if (atomic_load_explicit(&Y, memory_order_acquire)) {
    atomic_store_explicit(&X, 100, memory_order_relaxed);
    printf("%i", atomic_load_explicit(&X, memory_order_relaxed));
}
// With the acquire load, thread 2 prints 100 whenever it prints at all.
// With a relaxed load of Y the two stores to X would be unordered, so the
// C11 memory model would allow 99 to come last in X's modification order and
// be printed instead.