A quick measurement of the impact of false sharing on simple spinlocks

November 2, 2020

We'll measure the impact of false sharing on a simple spinlock. Our spinlock is the simplest possible implementation:

// spinlock.h
#pragma once

#include <stdatomic.h>
#include <stdbool.h>

#define SPINLOCK_INIT                                                          \
  (spinlock) { 0 }

enum spinlock_status
{
  LOCK_ACQUIRED,
  LOCK_BUSY
};

typedef struct spinlock
{
  _Atomic(bool) locked;
} spinlock;

static inline enum spinlock_status
spinlock_try_lock(spinlock* lock)
{
  // atomic_exchange returns the previous value: if it was already true,
  // another thread holds the lock. Acquire ordering pairs with the
  // release in spinlock_unlock.
  const bool busy =
    atomic_exchange_explicit(&lock->locked, true, memory_order_acquire);
  return busy ? LOCK_BUSY : LOCK_ACQUIRED;
}

static inline void
spinlock_unlock(spinlock* lock)
{
  // Release ordering publishes the critical section's writes to the next
  // thread that acquires the lock.
  atomic_exchange_explicit(&lock->locked, false, memory_order_release);
}

Our driver consists of two threads, each with its own lock. Each thread simply acquires and releases its lock on every iteration of a loop, and the loop runs 50 million times.

In the first run, the two locks are placed in the same cache line. In the second run, we ensure the locks sit in different cache lines.

// false_sharing.c
#include "spinlock.h"

#include <assert.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <threads.h>
#include <time.h>

#define ITERATIONS 50000000
#define CACHE_LINE_SIZE 64

#define CHECK(cond)                                                            \
  do {                                                                         \
    if (!(cond)) {                                                             \
      fprintf(stderr, "%s failed\n", #cond);                                   \
      abort();                                                                 \
    }                                                                          \
  } while (0)

int
worker(void* arg)
{
  spinlock* const lock = (spinlock*)arg;

  for (size_t iter = 0; iter < ITERATIONS; ++iter) {
    while (spinlock_try_lock(lock) == LOCK_BUSY) {
    }
    spinlock_unlock(lock);
  }

  return 0;
}

int
main(int argc, char* argv[])
{
  (void)argc;
  (void)argv;

  assert(sizeof(spinlock) == 1);

  struct
  {
    alignas(CACHE_LINE_SIZE) spinlock first;
    spinlock second_close;
    alignas(CACHE_LINE_SIZE) spinlock second_far;
  } locks = { 0 };

  thrd_t worker_a, worker_b;

  // Locks share a cache line
  {
    // clock() measures CPU time summed over all threads in the process.
    clock_t start = clock();

    CHECK(thrd_create(&worker_a, worker, &locks.first) == thrd_success);
    CHECK(thrd_create(&worker_b, worker, &locks.second_close) == thrd_success);
    CHECK(thrd_join(worker_a, NULL) == thrd_success);
    CHECK(thrd_join(worker_b, NULL) == thrd_success);

    clock_t end = clock();
    printf("Cache line shared: %g (CPU seconds)\n",
           ((double)(end - start)) / CLOCKS_PER_SEC);
  }

  // Locks don't share a cache line
  {
    clock_t start = clock();

    CHECK(thrd_create(&worker_a, worker, &locks.first) == thrd_success);
    CHECK(thrd_create(&worker_b, worker, &locks.second_far) == thrd_success);
    CHECK(thrd_join(worker_a, NULL) == thrd_success);
    CHECK(thrd_join(worker_b, NULL) == thrd_success);

    clock_t end = clock();
    printf("Cache line unshared: %g (CPU seconds)\n",
           ((double)(end - start)) / CLOCKS_PER_SEC);
  }

  return 0;
}

Holding both locks in a single structure, together with the alignas specifiers, guarantees their relative positions: first and second_close end up in the same cache line, while second_far starts on a cache line of its own.
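
To make that concrete, the layout can be verified at compile time with offsetof. The sketch below duplicates the locks struct but gives it a tag so offsetof can refer to it; the file name and the checks are illustrative, not part of the benchmark above.

// layout_check.c (illustrative only)
#include "spinlock.h"

#include <stdalign.h>
#include <stddef.h>

#define CACHE_LINE_SIZE 64

struct lock_layout
{
  alignas(CACHE_LINE_SIZE) spinlock first;
  spinlock second_close;
  alignas(CACHE_LINE_SIZE) spinlock second_far;
};

// first and second_close both land within the first 64 bytes...
_Static_assert(offsetof(struct lock_layout, second_close) < CACHE_LINE_SIZE,
               "second_close shares a cache line with first");
// ...while second_far begins on a cache-line boundary of its own.
_Static_assert(offsetof(struct lock_layout, second_far) % CACHE_LINE_SIZE == 0,
               "second_far starts a new cache line");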

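Assuming GCC or Clang on a system whose C library provides C11 threads (threads.h), a build command along these lines should work; the exact flags are an assumption:

$ cc -std=c11 -O2 -pthread false_sharing.c -o false_sharing
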
Compiling and running this gives the following output:

$ false_sharing
Cache line shared: 11.9485 (CPU seconds)
Cache line unshared: 1.69763 (CPU seconds)

This shows that false sharing can have quite an impact on performance: roughly a 7x slowdown in this run. This is far from a perfect benchmark, and it maximizes the impact of the phenomenon.
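
When two frequently-taken locks genuinely need to be separate objects, the usual mitigation is exactly what the second run does: give each lock its own cache line. A minimal sketch of a padded lock type, building on the spinlock above (the name and file are illustrative, not part of the benchmark):

// padded_spinlock.h (illustrative only)
#pragma once

#include "spinlock.h"

#include <stdalign.h>

#define CACHE_LINE_SIZE 64

// Aligning the member to the cache-line size makes the whole struct
// cache-line aligned and padded out to a full line, so adjacent
// padded_spinlock objects never share a cache line.
typedef struct padded_spinlock
{
  alignas(CACHE_LINE_SIZE) spinlock lock;
} padded_spinlock;

_Static_assert(sizeof(padded_spinlock) == CACHE_LINE_SIZE,
               "padded_spinlock should occupy a full cache line");

An array of padded_spinlock then keeps every element on its own line, at the cost of a full cache line (64 bytes here) per lock.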