From 61b0165ee7694b59326646d01b704aa9d3aa53d2 Mon Sep 17 00:00:00 2001
From: Angus Lepper <angus.lepper@ed.ac.uk>
Date: Mon, 16 Nov 2015 15:02:48 +0000
Subject: [PATCH] NUMA-aware affinity for workers

* This is not vectorisation-specific.
* There may be better trade-offs between Hyper-Threading and NUMA locality.
* Maybe print a warning when we require multiple NUMA nodes.
* Must also detect when Hyper-Threading is not present.
* Probably better as a configure flag, rather than conditional only upon
  the availability of libnuma.
* ~15-40% performance improvement on COSMA.
---
 configure.ac    |  3 +++
 examples/main.c | 15 +++++++++++++++
 src/engine.c    | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+)

diff --git a/configure.ac b/configure.ac
index d54d3dc7ef..de0e1b35d4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -270,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true],
 AM_CONDITIONAL(HAVESETAFFINITY,
     [test "$ac_cv_func_pthread_setaffinity_np" = "yes"])
 
+# Check for libnuma.
+AC_CHECK_LIB([numa], [numa_available])
+
 # Check for timing functions needed by cycle.h.
 AC_HEADER_TIME
 AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
diff --git a/examples/main.c b/examples/main.c
index 338bbe27e6..d15bbfb0d0 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -116,6 +116,21 @@ int main(int argc, char *argv[]) {
   /* Greeting message */
   if (myrank == 0) greetings();
 
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+  /* Ensure the NUMA node on which we initialise (first touch) everything
+   * doesn't change before engine_init allocates NUMA-local workers. Otherwise,
+   * we may be scheduled elsewhere between the two times.
+   */
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(sched_getcpu(), &affinity);
+  if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
+    message("failed to set entry thread's affinity");
+  } else {
+    message("set entry thread's affinity");
+  }
+#endif
+
   /* Init the space. */
   bzero(&s, sizeof(struct space));
 
diff --git a/src/engine.c b/src/engine.c
index cd4d6944ef..d3ced1ccca 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdbool.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -38,6 +39,10 @@
 #endif
 #endif
 
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#endif
+
 /* This object's header. */
 #include "engine.h"
 
@@ -2153,6 +2158,40 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
       for (j = maxint / i / 2; j < maxint; j += maxint / i)
         if (j < nr_cores && j != 0) cpuid[k++] = j;
 
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+    /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
+    if (numa_available() >= 0) {
+      if (nodeID == 0) message("prefer NUMA-local CPUs");
+
+      int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
+      bool done = false;
+      while (!done) {
+        done = true;
+        for (i = 1; i < nr_cores; i++) {
+          int node_a = numa_node_of_cpu(cpuid[i-1]);
+          int node_b = numa_node_of_cpu(cpuid[i]);
+
+          /* Avoid using local hyperthreads over unused remote physical cores.
+           * Assume two hyperthreads per core, split at cpuid >= nr_cores / 2.
+           */
+          int thread_a = cpuid[i-1] >= half;
+          int thread_b = cpuid[i] >= half;
+
+          bool swap = thread_a > thread_b;
+          if (thread_a == thread_b)
+            swap = numa_distance(home, node_a) > numa_distance(home, node_b);
+
+          if (swap) {
+            int t = cpuid[i-1];
+            cpuid[i-1] = cpuid[i];
+            cpuid[i] = t;
+            done = false;
+          }
+        }
+      }
+    }
+#endif
+
     if (nodeID == 0) {
 #ifdef WITH_MPI
       message("engine_init: cpu map is [ ");
-- 
GitLab