A quick measurement of the impact of false sharing on simple spinlocks
November 2, 2020
We'll measure the impact of false sharing on a simple spinlock. Our spinlock is the simplest possible implementation:
// spinlock.h
#pragma once
#include <stdatomic.h>
#include <stdbool.h>
#define SPINLOCK_INIT \
    (spinlock) { 0 }

enum spinlock_status
{
    LOCK_ACQUIRED,
    LOCK_BUSY
};

typedef struct spinlock
{
    _Atomic(bool) locked;
} spinlock;

static inline enum spinlock_status
spinlock_try_lock(spinlock* lock)
{
    const bool busy =
      atomic_exchange_explicit(&lock->locked, true, memory_order_acquire);
    return busy ? LOCK_BUSY : LOCK_ACQUIRED;
}

static inline void
spinlock_unlock(spinlock* lock)
{
    atomic_exchange_explicit(&lock->locked, false, memory_order_release);
}
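The try-lock is a single atomic exchange: the previous value tells us whether someone else already held the lock, and the acquire/release orderings pair up so that writes made inside the critical section are visible to the next owner. The benchmark below spins on the try-lock directly, so the header doesn't need a blocking acquire, but as a sketch (my addition, not part of the measured code) it would just be a loop around it:

// Hypothetical blocking acquire built on top of spinlock_try_lock; not part
// of the benchmark, shown only to illustrate how the primitive composes.
static inline void
spinlock_lock(spinlock* lock)
{
    while (spinlock_try_lock(lock) == LOCK_BUSY) {
        // Busy-wait until the exchange observes the lock as free.
    }
}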
Our driver consists of two threads, each with its own lock. Each thread simply takes and releases its lock on every iteration of a loop that runs 50 million times.
In the first run, the two locks are placed in the same cache line. In the second run, we ensure the locks sit in different cache lines.
// false_sharing.c
#include <spinlock.h>
#include <assert.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <threads.h>
#include <time.h>
#define ITERATIONS 50000000
#define CACHE_LINE_SIZE 64
#define CHECK(cond) do { if (!(cond)) { fprintf(stderr, "%s failed\n", #cond); abort(); } } while (0)
int
worker(void* arg)
{
    spinlock* const lock = (spinlock*)arg;
    for (size_t iter = 0; iter < ITERATIONS; ++iter) {
        while (spinlock_try_lock(lock) == LOCK_BUSY) {
        }
        spinlock_unlock(lock);
    }
    return 0;
}

int
main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    assert(sizeof(spinlock) == 1);

    struct
    {
        alignas(CACHE_LINE_SIZE) spinlock first;
        spinlock second_close;
        alignas(CACHE_LINE_SIZE) spinlock second_far;
    } locks = { 0 };

    thrd_t worker_a, worker_b;

    // Locks share a cache line
    {
        clock_t start = clock();
        CHECK(thrd_create(&worker_a, worker, &locks.first) == thrd_success);
        CHECK(thrd_create(&worker_b, worker, &locks.second_close) == thrd_success);
        thrd_join(worker_a, NULL);
        thrd_join(worker_b, NULL);
        clock_t end = clock();
        printf("Cache line shared: %g (CPU seconds)\n",
               ((double)(end - start)) / CLOCKS_PER_SEC);
    }

    // Locks don't share a cache line
    {
        clock_t start = clock();
        CHECK(thrd_create(&worker_a, worker, &locks.first) == thrd_success);
        CHECK(thrd_create(&worker_b, worker, &locks.second_far) == thrd_success);
        thrd_join(worker_a, NULL);
        thrd_join(worker_b, NULL);
        clock_t end = clock();
        printf("Cache line unshared: %g (CPU seconds)\n",
               ((double)(end - start)) / CLOCKS_PER_SEC);
    }

    return 0;
}
Holding the locks in a single structure, together with the alignas specifiers, guarantees their relative positioning: first and second_close land in the same 64-byte cache line, while second_far starts on a line of its own.
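If you would rather have that layout checked than assumed, the offsets can be verified at compile time. A minimal sketch, assuming the struct is given a name so offsetof can be applied to it and the same CACHE_LINE_SIZE of 64 (these checks are not part of the original driver):

// Hypothetical compile-time layout checks (assume CACHE_LINE_SIZE == 64 and
// the spinlock type from spinlock.h; static_assert comes from <assert.h>).
#include <stddef.h> // offsetof

struct lock_layout
{
    alignas(CACHE_LINE_SIZE) spinlock first;
    spinlock second_close;
    alignas(CACHE_LINE_SIZE) spinlock second_far;
};

// first and second_close fall within the same 64-byte cache line...
static_assert(offsetof(struct lock_layout, second_close) < CACHE_LINE_SIZE,
              "second_close shares a cache line with first");
// ...while second_far starts at the next cache-line boundary.
static_assert(offsetof(struct lock_layout, second_far) % CACHE_LINE_SIZE == 0,
              "second_far is cache-line aligned");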
Compiling and running this gives output:
$ false_sharing
Cache line shared: 11.9485 (CPU seconds)
Cache line unshared: 1.69763 (CPU seconds)
This shows false sharing can have quite an impact on performance (roughly a 7x slowdown in this run). This is far from a perfect benchmark, though, and it is set up to maximize the impact of the phenomenon.
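When this matters in real code, a common mitigation is to give each lock its own cache line so neighbouring data can't end up beside it. A minimal sketch, reusing the 64-byte CACHE_LINE_SIZE from above (this padded type is my addition, not something the benchmark uses):

// Hypothetical spinlock padded out to a full cache line so adjacent objects
// (or other locks in an array) cannot share its line. Trades memory for
// isolation from false sharing.
typedef struct padded_spinlock
{
    alignas(CACHE_LINE_SIZE) spinlock lock;
    char pad[CACHE_LINE_SIZE - sizeof(spinlock)];
} padded_spinlock;

An array of padded_spinlock then places every element on its own line, which is what the alignas annotations in the driver's struct arrange by hand.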