diff --git a/examples/main.c b/examples/main.c
index b607791729ac0d419a227edae72814542898d931..ca3a696816369d6edac261bd3daa0fff31f84a91 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -52,6 +52,7 @@ void print_help_message() {
 
   printf("\nUsage: swift [OPTION] PARAMFILE\n\n");
   printf("Valid options are:\n");
+  printf("  %2s %8s %s\n", "-a", "[01]", "Use processor affinity");
   printf("  %2s %8s %s\n", "-c", "", "Run with cosmological time integration");
   printf(
       "  %2s %8s %s\n", "-d", "",
@@ -132,6 +133,7 @@ int main(int argc, char *argv[]) {
   /* Welcome to SWIFT, you made the right choice */
   if (myrank == 0) greetings();
 
+  int with_aff = 1;
   int dry_run = 0;
   int dump_tasks = 0;
   int with_cosmology = 0;
@@ -146,7 +148,14 @@ int main(int argc, char *argv[]) {
 
   /* Parse the parameters */
   int c;
-  while ((c = getopt(argc, argv, "cdef:gGhst:v:y:")) != -1) switch (c) {
+  while ((c = getopt(argc, argv, "a:cdef:gGhst:v:y:")) != -1) switch (c) {
+      case 'a':
+        if (sscanf(optarg, "%d", &with_aff) != 1) {
+          if (myrank == 0) printf("Error parsing affinity switch (-a).\n");
+          if (myrank == 0) print_help_message();
+          return 1;
+        }
+        break;
       case 'c':
         with_cosmology = 1;
         break;
@@ -411,8 +420,9 @@ int main(int argc, char *argv[]) {
   /* Initialize the engine with the space and policies. */
   if (myrank == 0) clocks_gettime(&tic);
   struct engine e;
-  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, engine_policies,
-              talking, &prog_const, &hydro_properties, &potential);
+  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, with_aff,
+              engine_policies, talking, &prog_const, &hydro_properties,
+              &potential);
   if (myrank == 0) {
     clocks_gettime(&toc);
     message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
diff --git a/src/engine.c b/src/engine.c
index 31cc9abd629b08bf1575e2b92f0b3afd4d00d5df..9751b93fc81098f243505d2310d42a599b239683 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2445,6 +2445,7 @@ void engine_unpin() {
  * @param nr_nodes The number of MPI ranks.
  * @param nodeID The MPI rank of this node.
  * @param nr_threads The number of threads per MPI rank.
+ * @param with_aff use processor affinity, if supported.
  * @param policy The queuing policy to use.
  * @param verbose Is this #engine talkative ?
  * @param physical_constants The #phys_const used for this run.
@@ -2454,7 +2455,7 @@ void engine_unpin() {
 
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int policy, int verbose,
+                 int nr_threads, int with_aff, int policy, int verbose,
                  const struct phys_const *physical_constants,
                  const struct hydro_props *hydro,
                  const struct external_potential *potential) {
@@ -2527,7 +2528,7 @@ void engine_init(struct engine *e, struct space *s,
     buf[j] = CPU_ISSET(j, entry_affinity) ? '1' : '0';
   }
 
-  if (verbose) message("Affinity at entry: %s", buf);
+  if (verbose && with_aff) message("Affinity at entry: %s", buf);
 
   int *cpuid = malloc(nr_affinity_cores * sizeof(int));
   cpu_set_t cpuset;
@@ -2541,51 +2542,87 @@ void engine_init(struct engine *e, struct space *s,
     skip = c + 1;
   }
 
+  if (with_aff) {
+
 #if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
-  if ((policy & engine_policy_cputight) != engine_policy_cputight) {
-    /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
-    if (numa_available() >= 0) {
-      if (nodeID == 0) message("prefer NUMA-local CPUs");
-
-      const int home = numa_node_of_cpu(sched_getcpu());
-      int done = 0;
-
-      while (!done) {
-        done = 1;
-        for (int i = 1; i < nr_affinity_cores; i++) {
-          const int node_a = numa_node_of_cpu(cpuid[i - 1]);
-          const int node_b = numa_node_of_cpu(cpuid[i]);
-
-          const int swap =
-              numa_distance(home, node_a) > numa_distance(home, node_b);
-
-          if (swap) {
-            const int t = cpuid[i - 1];
-            cpuid[i - 1] = cpuid[i];
-            cpuid[i] = t;
-            done = 0;
+    if ((policy & engine_policy_cputight) != engine_policy_cputight) {
+
+      if (numa_available() >= 0) {
+        if (nodeID == 0) message("prefer NUMA-distant CPUs");
+
+        /* Get list of numa nodes of all available cores. */
+        int *nodes = malloc(nr_affinity_cores * sizeof(int));
+        int nnodes = 0;
+        for (int i = 0; i < nr_affinity_cores; i++) {
+          nodes[i] = numa_node_of_cpu(cpuid[i]);
+          /* numa_node_of_cpu() returns -1 on failure; file such cores under
+           * node 0 so that they can never index outside core_counts. */
+          if (nodes[i] < 0) nodes[i] = 0;
+          if (nodes[i] > nnodes) nnodes = nodes[i];
+        }
+        nnodes += 1;
+
+        /* Count cores per node; calloc starts every counter at zero. */
+        int *core_counts = calloc(nnodes, sizeof(int));
+        for (int i = 0; i < nr_affinity_cores; i++) {
+          core_counts[nodes[i]] += 1;
+        }
+
+        /* Index cores within each node. */
+        int *core_indices = malloc(nr_affinity_cores * sizeof(int));
+        for (int i = nr_affinity_cores - 1; i >= 0; i--) {
+          core_indices[i] = core_counts[nodes[i]];
+          core_counts[nodes[i]] -= 1;
+        }
+
+        /* Now sort so that we pick adjacent cpuids from different nodes
+         * by sorting internal node core indices. */
+        int done = 0;
+        while (!done) {
+          done = 1;
+          for (int i = 1; i < nr_affinity_cores; i++) {
+            if (core_indices[i] < core_indices[i - 1]) {
+              int t = cpuid[i - 1];
+              cpuid[i - 1] = cpuid[i];
+              cpuid[i] = t;
+
+              t = core_indices[i - 1];
+              core_indices[i - 1] = core_indices[i];
+              core_indices[i] = t;
+              done = 0;
+            }
           }
         }
+
+        /* Release the scratch arrays. */
+        free(nodes);
+        free(core_counts);
+        free(core_indices);
       }
     }
-  }
 #endif
+  } else {
+    /* Pinning was not requested: skip the NUMA-aware reordering of cpuid. */
+    if (nodeID == 0) message("no processor affinity used");
+  } /* with_aff */
 
   /* Avoid (unexpected) interference between engine and runner threads. We can
    * do this once we've made at least one call to engine_entry_affinity and
    * maybe numa_node_of_cpu(sched_getcpu()), even if the engine isn't already
-   * pinned. */
+   * pinned. Also unpin this when asked to not pin at all (!with_aff). */
   engine_unpin();
 #endif
 
+  if (with_aff) {
 #ifdef WITH_MPI
-  printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
-         clocks_get_timesincestart());
+    printf("[%04i] %s engine_init: cpu map is [ ", nodeID,
+           clocks_get_timesincestart());
 #else
-  printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
+    printf("%s engine_init: cpu map is [ ", clocks_get_timesincestart());
 #endif
-  for (int i = 0; i < nr_affinity_cores; i++) printf("%i ", cpuid[i]);
-  printf("].\n");
+    for (int i = 0; i < nr_affinity_cores; i++) printf("%i ", cpuid[i]);
+    printf("].\n");
+  }
 
   /* Are we doing stuff in parallel? */
   if (nr_nodes > 1) {
@@ -2730,12 +2767,13 @@ void engine_init(struct engine *e, struct space *s,
       error("Failed to create runner thread.");
 
     /* Try to pin the runner to a given core */
-    if ((e->policy & engine_policy_setaffinity) == engine_policy_setaffinity) {
+    if (with_aff &&
+        (e->policy & engine_policy_setaffinity) == engine_policy_setaffinity) {
 #if defined(HAVE_SETAFFINITY)
 
       /* Set a reasonable queue ID. */
       int coreid = k % nr_affinity_cores;
       e->runners[k].cpuid = cpuid[coreid];
       if (nr_queues < e->nr_threads)
         e->runners[k].qid = cpuid[coreid] * nr_queues / nr_affinity_cores;
       else
@@ -2757,15 +2795,21 @@ void engine_init(struct engine *e, struct space *s,
       e->runners[k].cpuid = k;
       e->runners[k].qid = k * nr_queues / e->nr_threads;
     }
-    if (verbose)
-      message("runner %i on cpuid=%i with qid=%i.", e->runners[k].id,
-              e->runners[k].cpuid, e->runners[k].qid);
+    if (verbose) {
+      /* Only report a cpuid when processor affinity was requested. */
+      if (with_aff)
+        message("runner %i on cpuid=%i with qid=%i.", e->runners[k].id,
+                e->runners[k].cpuid, e->runners[k].qid);
+      else
+        message("runner %i using qid=%i no cpuid.", e->runners[k].id,
+                e->runners[k].qid);
+    }
   }
 
 /* Free the affinity stuff */
 #if defined(HAVE_SETAFFINITY)
   free(cpuid);
   free(buf);
 #endif
 
   /* Wait for the runner threads to be in place. */
diff --git a/src/engine.h b/src/engine.h
index 15abc4b6393fec97582e2ec3c7551f5f64f9a0e4..6e7987ee9ec35727a2b889dba8a6fb8ee4da6088 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -197,7 +197,7 @@ void engine_compute_next_snapshot_time(struct engine *e);
 void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int policy, int verbose,
+                 int nr_threads, int with_aff, int policy, int verbose,
                  const struct phys_const *physical_constants,
                  const struct hydro_props *hydro,
                  const struct external_potential *potential);