diff --git a/configure.ac b/configure.ac index d54d3dc7efe01f4bcd67502c07e3a44a3b06a3a6..de0e1b35d4da78d103ddf27304ad527361a342c7 100644 --- a/configure.ac +++ b/configure.ac @@ -270,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true], AM_CONDITIONAL(HAVESETAFFINITY, [test "$ac_cv_func_pthread_setaffinity_np" = "yes"]) +# Check for libnuma. +AC_CHECK_LIB([numa], [numa_available]) + # Check for timing functions needed by cycle.h. AC_HEADER_TIME AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) diff --git a/examples/main.c b/examples/main.c index 338bbe27e682f57540232af3a111fcc1ee716ef6..d15bbfb0d0c28cb80540e86d36a185aabb1ece38 100644 --- a/examples/main.c +++ b/examples/main.c @@ -116,6 +116,21 @@ int main(int argc, char *argv[]) { /* Greeting message */ if (myrank == 0) greetings(); +#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE) + /* Ensure the NUMA node on which we initialise (first touch) everything + * doesn't change before engine_init allocates NUMA-local workers. Otherwise, + * we may be scheduled elsewhere between the two times. + */ + cpu_set_t affinity; + CPU_ZERO(&affinity); + CPU_SET(sched_getcpu(), &affinity); + if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) { + message("failed to set entry thread's affinity"); + } else { + message("set entry thread's affinity"); + } +#endif + /* Init the space. */ bzero(&s, sizeof(struct space)); diff --git a/src/engine.c b/src/engine.c index cd4d6944ef43f1ac6d580384cb3a616e7644a03f..d3ced1ccca8ca4a176d8f089e558abc3c30aa725 100644 --- a/src/engine.c +++ b/src/engine.c @@ -28,6 +28,7 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <stdbool.h> /* MPI headers. */ #ifdef WITH_MPI @@ -38,6 +39,10 @@ #endif #endif +#ifdef HAVE_LIBNUMA +#include <numa.h> +#endif + /* This object's header. */ #include "engine.h" @@ -2153,6 +2158,40 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, for (j = maxint / i / 2; j < maxint; j += maxint / i) if (j < nr_cores && j != 0) cpuid[k++] = j; +#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE) + /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */ + if (numa_available() >= 0) { + if (nodeID == 0) message("prefer NUMA-local CPUs"); + + int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2; + bool done = false; + while (!done) { + done = true; + for (i = 1; i < nr_cores; i++) { + int node_a = numa_node_of_cpu(cpuid[i-1]); + int node_b = numa_node_of_cpu(cpuid[i]); + + /* Avoid using local hyperthreads over unused remote physical cores. + * Assume two hyperthreads, and that cpuid >= half partitions them. + */ + int thread_a = cpuid[i-1] >= half; + int thread_b = cpuid[i] >= half; + + bool swap = thread_a > thread_b; + if (thread_a == thread_b) + swap = numa_distance(home, node_a) > numa_distance(home, node_b); + + if (swap) { + int t = cpuid[i-1]; + cpuid[i-1] = cpuid[i]; + cpuid[i] = t; + done = false; + } + } + } + } +#endif + if (nodeID == 0) { #ifdef WITH_MPI message("engine_init: cpu map is [ ");