diff --git a/configure.ac b/configure.ac
index d54d3dc7efe01f4bcd67502c07e3a44a3b06a3a6..de0e1b35d4da78d103ddf27304ad527361a342c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -270,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true],
 AM_CONDITIONAL(HAVESETAFFINITY,
     [test "$ac_cv_func_pthread_setaffinity_np" = "yes"])
 
+# Check for libnuma. If found, this defines HAVE_LIBNUMA and adds -lnuma to LIBS.
+AC_CHECK_LIB([numa], [numa_available])
+
 # Check for timing functions needed by cycle.h.
 AC_HEADER_TIME
 AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
diff --git a/examples/main.c b/examples/main.c
index 338bbe27e682f57540232af3a111fcc1ee716ef6..d15bbfb0d0c28cb80540e86d36a185aabb1ece38 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -116,6 +116,21 @@ int main(int argc, char *argv[]) {
   /* Greeting message */
   if (myrank == 0) greetings();
 
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+  /* Pin the entry thread to the CPU it currently runs on, so that the NUMA
+   * node on which we initialise (first touch) everything cannot change before
+   * engine_init allocates the NUMA-local workers. Otherwise the scheduler
+   * could migrate us to a different node in between. */
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(sched_getcpu(), &affinity);
+  if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
+    message("failed to set entry thread's affinity");
+  } else {
+    message("set entry thread's affinity");
+  }
+#endif
+
   /* Init the space. */
   bzero(&s, sizeof(struct space));
 
diff --git a/src/engine.c b/src/engine.c
index cd4d6944ef43f1ac6d580384cb3a616e7644a03f..d3ced1ccca8ca4a176d8f089e558abc3c30aa725 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdbool.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -38,6 +39,10 @@
 #endif
 #endif
 
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#endif
+
 /* This object's header. */
 #include "engine.h"
 
@@ -2153,6 +2158,44 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
       for (j = maxint / i / 2; j < maxint; j += maxint / i)
         if (j < nr_cores && j != 0) cpuid[k++] = j;
 
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+    /* Sort the CPUs by ascending NUMA distance from the home node. The bubble
+     * sort is stable, so equidistant CPUs keep their existing order. */
+    if (numa_available() >= 0) {
+      if (nodeID == 0) message("prefer NUMA-local CPUs");
+
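+      /* NUMA node of the CPU we are currently running on, and the index at
+       * which the second hyperthread of each core is assumed to start. */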
+      int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
+      bool done = false;
+      while (!done) {
+        done = true;
+        for (i = 1; i < nr_cores; i++) {
+          int node_a = numa_node_of_cpu(cpuid[i-1]);
+          int node_b = numa_node_of_cpu(cpuid[i]);
+
+          /* Avoid filling local hyperthreads while remote physical cores are
+           * still unused. Assume two hyperthreads per core, with the second
+           * hyperthreads being the CPU ids >= half. */
+          int thread_a = cpuid[i-1] >= half;
+          int thread_b = cpuid[i] >= half;
+
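+          /* Sort by hyperthread index first, then by NUMA distance. */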
+          bool swap = thread_a > thread_b;
+          if (thread_a == thread_b)
+            swap = numa_distance(home, node_a) > numa_distance(home, node_b);
+
+          if (swap) {
+            int t = cpuid[i-1];
+            cpuid[i-1] = cpuid[i];
+            cpuid[i] = t;
+            done = false;
+          }
+        }
+      }
+    }
+#endif
+
     if (nodeID == 0) {
 #ifdef WITH_MPI
       message("engine_init: cpu map is [ ");