Commit 1caa0a07 authored by Angus Lepper
Don't overstep the affinity we're given at entry

parent 22234b65
Merge request !120: Simplify, permit user control over affinity
@@ -138,17 +138,9 @@ int main(int argc, char *argv[]) {
   if ((ENGINE_POLICY) & engine_policy_setaffinity) {
     /* Ensure the NUMA node on which we initialise (first touch) everything
      * doesn't change before engine_init allocates NUMA-local workers.
-     * Otherwise,
-     * we may be scheduled elsewhere between the two times.
+     * Otherwise, we may be scheduled elsewhere between the two times.
      */
-    cpu_set_t affinity;
-    CPU_ZERO(&affinity);
-    CPU_SET(sched_getcpu(), &affinity);
-    if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
-      message("failed to set entry thread's affinity");
-    } else {
-      message("set entry thread's affinity");
-    }
+    engine_pin();
   }
 #endif
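For context: first touch matters because the writes that initialise a page decide which NUMA node backs it, so the thread must not migrate between initialisation and engine_init. A minimal standalone sketch of that rationale (illustrative only, not part of this commit):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
  /* Pin to wherever the scheduler put us, so first-touch initialisation and
   * the later compute happen on the same NUMA node. This mirrors the removed
   * main.c block; engine_pin() keeps the guarantee while also honouring the
   * affinity mask the process was started with. */
  cpu_set_t affinity;
  CPU_ZERO(&affinity);
  CPU_SET(sched_getcpu(), &affinity);
  if (sched_setaffinity(0, sizeof affinity, &affinity) != 0) {
    perror("sched_setaffinity");
    return EXIT_FAILURE;
  }

  /* First touch: writing the pages binds them to this thread's NUMA node. */
  const size_t n = 1 << 20;
  double *data = malloc(n * sizeof *data);
  if (data == NULL) return EXIT_FAILURE;
  memset(data, 0, n * sizeof *data);

  printf("initialised %zu doubles on CPU %d\n", n, sched_getcpu());
  free(data);
  return 0;
}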
@@ -1925,6 +1925,40 @@ static bool hyperthreads_present(void) {
 }
 #endif
 
+#ifdef HAVE_SETAFFINITY
+static cpu_set_t entry_affinity;
+static bool use_entry_affinity = false;
+
+static cpu_set_t *engine_entry_affinity(void) {
+  if (!use_entry_affinity) {
+    pthread_t engine = pthread_self();
+    pthread_getaffinity_np(engine, sizeof entry_affinity, &entry_affinity);
+    use_entry_affinity = true;
+  }
+  return &entry_affinity;
+}
+
+void engine_pin(void) {
+  cpu_set_t *entry_affinity = engine_entry_affinity();
+
+  int pin;
+  for (pin = 0; pin < CPU_SETSIZE && !CPU_ISSET(pin, entry_affinity); ++pin);
+
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(pin, &affinity);
+  if (sched_setaffinity(0, sizeof affinity, &affinity) != 0) {
+    error("failed to set engine's affinity");
+  }
+}
+
+static void engine_unpin(void) {
+  pthread_t main_thread = pthread_self();
+  pthread_setaffinity_np(main_thread, sizeof entry_affinity, &entry_affinity);
+}
+#endif
+
 /**
  * @brief init an engine with the given number of threads, queues, and
  *        the given policy.
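engine_entry_affinity() captures whatever mask the launcher (taskset, numactl, an MPI runtime, ...) granted the process, and engine_pin() settles on the lowest CPU in it. A small self-contained probe of that behaviour (illustrative only, not from the commit; build with -pthread):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main(void) {
  /* Read the mask we inherited at entry, as engine_entry_affinity() does. */
  cpu_set_t entry;
  pthread_getaffinity_np(pthread_self(), sizeof entry, &entry);

  /* The lowest permitted CPU is the one engine_pin() would settle on. */
  int first = 0;
  while (first < CPU_SETSIZE && !CPU_ISSET(first, &entry)) ++first;

  printf("%d permitted CPUs, lowest is %d\n", CPU_COUNT(&entry), first);
  return 0;
}

Launched as "taskset -c 4-7 ./a.out", this would report 4 permitted CPUs with 4 as the lowest, which is exactly the CPU the engine now pins to instead of overstepping the mask.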
@@ -1982,22 +2016,39 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
 #if defined(HAVE_SETAFFINITY)
   const int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
-  int cpuid[nr_cores];
+  cpu_set_t *entry_affinity = engine_entry_affinity();
+  const int nr_affinity_cores = CPU_COUNT(entry_affinity);
+
+  if (nr_cores > CPU_SETSIZE) {
+    // Unlikely, except on e.g. SGI UV.
+    error("must allocate dynamic cpu_set_t (too many cores per node)");
+  }
+
+  char buf[nr_cores + 1];
+  buf[nr_cores] = '\0';
+  for (int j = 0; j < nr_cores; ++j) {
+    /* Reversed bit order from convention, but same as e.g. Intel MPI's
+     * I_MPI_PIN_DOMAIN explicit mask: left-to-right, LSB-to-MSB.
+     */
+    buf[j] = CPU_ISSET(j, entry_affinity) ? '1' : '0';
+  }
+  message("affinity at entry: %s", buf);
+
+  int cpuid[nr_affinity_cores];
   cpu_set_t cpuset;
-  if ((policy & engine_policy_cputight) == engine_policy_cputight) {
-    for (int k = 0; k < nr_cores; k++) cpuid[k] = k;
-  } else {
-    /* Get next highest power of 2. */
-    int maxint = 1;
-    while (maxint < nr_cores) maxint *= 2;
-
-    cpuid[0] = 0;
-    int k = 1;
-    for (int i = 1; i < maxint; i *= 2)
-      for (int j = maxint / i / 2; j < maxint; j += maxint / i)
-        if (j < nr_cores && j != 0) cpuid[k++] = j;
+
+  int skip = 0;
+  for (int k = 0; k < nr_affinity_cores; k++) {
+    int c;
+    for (c = skip; c < CPU_SETSIZE && !CPU_ISSET(c, entry_affinity); ++c);
+    cpuid[k] = c;
+    skip = c + 1;
+  }
 
 #if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+  if ((policy & engine_policy_cputight) != engine_policy_cputight) {
     /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
     if (numa_available() >= 0) {
       if (nodeID == 0) message("prefer NUMA-local CPUs");
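The skip loop above packs the indices of the set bits of the entry mask into a dense array, so cpuid[k] is the k-th CPU the process may actually use. The same idiom in isolation, with an assumed mask standing in for a restricted launch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
  /* Pretend we were launched with something like "taskset -c 2,5,6". */
  cpu_set_t allowed;
  CPU_ZERO(&allowed);
  CPU_SET(2, &allowed);
  CPU_SET(5, &allowed);
  CPU_SET(6, &allowed);

  const int n = CPU_COUNT(&allowed);
  int cpuid[CPU_SETSIZE];

  /* Same idiom as the diff: advance past clear bits, record each set bit. */
  int skip = 0;
  for (int k = 0; k < n; k++) {
    int c;
    for (c = skip; c < CPU_SETSIZE && !CPU_ISSET(c, &allowed); ++c);
    cpuid[k] = c;
    skip = c + 1;
  }

  for (int k = 0; k < n; k++) printf("cpuid[%d] = %d\n", k, cpuid[k]);
  return 0;
}

This prints cpuid[0] = 2, cpuid[1] = 5, cpuid[2] = 6: the runners are confined to the permitted CPUs rather than to 0..nr_cores-1.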
@@ -2011,7 +2062,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
       while (!done) {
         done = true;
-        for (int i = 1; i < nr_cores; i++) {
+        for (int i = 1; i < nr_affinity_cores; i++) {
           const int node_a = numa_node_of_cpu(cpuid[i - 1]);
           const int node_b = numa_node_of_cpu(cpuid[i]);
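This is the bubblesort the earlier comment refers to; its comparison body is truncated in this view, so the following is only a sketch of the ascending-NUMA-distance idea, assuming libnuma (link with -lnuma). sort_by_numa_distance and the sample cpuid values are illustrative, not the commit's code:

#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>

/* Stable bubblesort of CPUs by ascending NUMA distance from home_node;
 * stability keeps equidistant CPUs in their existing order. */
static void sort_by_numa_distance(int *cpuid, int n, int home_node) {
  int done = 0;
  while (!done) {
    done = 1;
    for (int i = 1; i < n; i++) {
      const int d_a = numa_distance(home_node, numa_node_of_cpu(cpuid[i - 1]));
      const int d_b = numa_distance(home_node, numa_node_of_cpu(cpuid[i]));
      if (d_b < d_a) { /* closer CPUs bubble to the front */
        const int tmp = cpuid[i - 1];
        cpuid[i - 1] = cpuid[i];
        cpuid[i] = tmp;
        done = 0;
      }
    }
  }
}

int main(void) {
  if (numa_available() < 0) return 0; /* no NUMA support on this host */
  int cpuid[] = {0, 1, 2, 3};
  sort_by_numa_distance(cpuid, 4, numa_node_of_cpu(sched_getcpu()));
  for (int i = 0; i < 4; i++) printf("%d ", cpuid[i]);
  printf("\n");
  return 0;
}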
@@ -2034,20 +2085,24 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
         }
       }
     }
   }
 #endif
 
-  if (nodeID == 0) {
+  /* Avoid (unexpected) interference between engine and runner threads. We can
+   * do this once we've made at least one call to engine_entry_affinity and
+   * maybe numa_node_of_cpu(sched_getcpu()), even if the engine isn't already
+   * pinned.
+   */
+  engine_unpin();
+
 #ifdef WITH_MPI
-    printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
-           clocks_get_timesincestart());
+  printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
+         clocks_get_timesincestart());
 #else
-    printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
+  printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
 #endif
-    for (int i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]);
-    printf("].\n");
-  }
+  for (int i = 0; i < nr_affinity_cores; i++) printf("%i ", cpuid[i]);
+  printf("].\n");
 #endif
 
   /* Are we doing stuff in parallel? */
   if (nr_nodes > 1) {
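Taken together, the commit establishes a pin/unpin bracket: main() pins before first touch, and engine_init() places the runners inside the entry mask before unpinning the engine thread. A round-trip sketch of that bracket, with hypothetical stand-ins pin()/unpin() for the real engine_pin()/engine_unpin():

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static cpu_set_t entry; /* saved entry mask, as engine_entry_affinity keeps */

static void pin(void) {
  /* Save the entry mask, then hold only its lowest CPU. */
  pthread_getaffinity_np(pthread_self(), sizeof entry, &entry);
  int first = 0;
  while (first < CPU_SETSIZE && !CPU_ISSET(first, &entry)) ++first;
  cpu_set_t one;
  CPU_ZERO(&one);
  CPU_SET(first, &one);
  sched_setaffinity(0, sizeof one, &one);
}

static void unpin(void) {
  /* Restore exactly what the launcher gave us, nothing wider. */
  pthread_setaffinity_np(pthread_self(), sizeof entry, &entry);
}

int main(void) {
  pin();
  printf("first touch on CPU %d\n", sched_getcpu());
  /* ... engine_init would allocate NUMA-local data here ... */
  unpin();
  printf("entry mask restored: %d CPUs permitted\n", CPU_COUNT(&entry));
  return 0;
}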
@@ -2176,15 +2231,16 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
 #if defined(HAVE_SETAFFINITY)
     /* Set a reasonable queue ID. */
-    e->runners[k].cpuid = cpuid[k % nr_cores];
+    int coreid = k % nr_affinity_cores;
+    e->runners[k].cpuid = cpuid[coreid];
     if (nr_queues < nr_threads)
-      e->runners[k].qid = cpuid[k % nr_cores] * nr_queues / nr_cores;
+      e->runners[k].qid = cpuid[coreid] * nr_queues / nr_affinity_cores;
     else
       e->runners[k].qid = k;
 
     /* Set the cpu mask to zero | e->id. */
     CPU_ZERO(&cpuset);
-    CPU_SET(cpuid[k % nr_cores], &cpuset);
+    CPU_SET(cpuid[coreid], &cpuset);
 
     /* Apply this mask to the runner's pthread. */
     if (pthread_setaffinity_np(e->runners[k].thread, sizeof(cpu_set_t),
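The qid formula spreads runners over queues in proportion to their CPU's position in the permitted set. A worked example with assumed numbers (permitted CPUs 0-7, two queues; note cpuid[] holds absolute CPU ids, so the even split assumes the permitted set starts at CPU 0):

#include <stdio.h>

int main(void) {
  /* Illustrative numbers only: 8 permitted CPUs and 2 queues. */
  const int nr_affinity_cores = 8, nr_queues = 2;
  for (int cpu = 0; cpu < nr_affinity_cores; cpu++)
    printf("cpu %d -> queue %d\n", cpu, cpu * nr_queues / nr_affinity_cores);
  return 0;
}

With these numbers, CPUs 0-3 share queue 0 and CPUs 4-7 share queue 1, so neighbouring runners contend on the same queue rather than scattering across all of them.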
@@ -170,6 +170,7 @@ struct engine {
 /* Function prototypes. */
 void engine_barrier(struct engine *e, int tid);
+void engine_pin(void);
 void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
                  int nr_queues, int nr_nodes, int nodeID, int policy,
                  float timeBegin, float timeEnd, float dt_min, float dt_max,