diff --git a/configure.ac b/configure.ac
index d54d3dc7efe01f4bcd67502c07e3a44a3b06a3a6..de0e1b35d4da78d103ddf27304ad527361a342c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -270,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true],
 AM_CONDITIONAL(HAVESETAFFINITY,
     [test "$ac_cv_func_pthread_setaffinity_np" = "yes"])
 
+# Check for libnuma.
+AC_CHECK_LIB([numa], [numa_available])
+
 # Check for timing functions needed by cycle.h.
 AC_HEADER_TIME
 AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
diff --git a/examples/main.c b/examples/main.c
index 338bbe27e682f57540232af3a111fcc1ee716ef6..d15bbfb0d0c28cb80540e86d36a185aabb1ece38 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -116,6 +116,21 @@ int main(int argc, char *argv[]) {
   /* Greeting message */
   if (myrank == 0) greetings();
 
+#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+  /* Ensure the NUMA node on which we initialise (first touch) everything
+   * doesn't change before engine_init allocates NUMA-local workers. Otherwise,
+   * we may be scheduled elsewhere between the two times.
+   */
+  cpu_set_t affinity;
+  CPU_ZERO(&affinity);
+  CPU_SET(sched_getcpu(), &affinity);
+  if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) {
+    message("failed to set entry thread's affinity");
+  } else {
+    message("set entry thread's affinity");
+  }
+#endif
+
   /* Init the space. */
   bzero(&s, sizeof(struct space));
 
diff --git a/src/engine.c b/src/engine.c
index cd4d6944ef43f1ac6d580384cb3a616e7644a03f..054e9c8a64585c602c08ef2d4aae2ecccf21aa21 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdbool.h>
 
 /* MPI headers. */
 #ifdef WITH_MPI
@@ -38,6 +39,10 @@
 #endif
 #endif
 
+#ifdef HAVE_LIBNUMA
+#include <numa.h>
+#endif
+
 /* This object's header. */
 #include "engine.h"
 
@@ -2117,6 +2122,22 @@ void engine_split(struct engine *e, int *grid) {
   s->xparts = xparts_new;
 }
 
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+static bool hyperthreads_present(void) {
+#ifdef __linux__
+  FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");
+
+  int c;
+  while ((c = fgetc(f)) != EOF && c != ',');
+  fclose(f);
+
+  return c == ',';
+#else
+  return true; // just guess
+#endif
+}
+#endif
+
 /**
  * @brief init an engine with the given number of threads, queues, and
  *      the given policy.
@@ -2153,6 +2174,42 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
       for (j = maxint / i / 2; j < maxint; j += maxint / i)
         if (j < nr_cores && j != 0) cpuid[k++] = j;
 
+#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
+    /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */
+    if (numa_available() >= 0) {
+      if (nodeID == 0) message("prefer NUMA-local CPUs");
+
+      int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
+      bool done = false, swap_hyperthreads = hyperthreads_present();
+      if (swap_hyperthreads) message("prefer physical cores to hyperthreads");
+
+      while (!done) {
+        done = true;
+        for (i = 1; i < nr_cores; i++) {
+          int node_a = numa_node_of_cpu(cpuid[i-1]);
+          int node_b = numa_node_of_cpu(cpuid[i]);
+
+          /* Avoid using local hyperthreads over unused remote physical cores.
+           * Assume two hyperthreads, and that cpuid >= half partitions them.
+           */
+          int thread_a = swap_hyperthreads && cpuid[i-1] >= half;
+          int thread_b = swap_hyperthreads && cpuid[i] >= half;
+
+          bool swap = thread_a > thread_b;
+          if (thread_a == thread_b)
+            swap = numa_distance(home, node_a) > numa_distance(home, node_b);
+
+          if (swap) {
+            int t = cpuid[i-1];
+            cpuid[i-1] = cpuid[i];
+            cpuid[i] = t;
+            done = false;
+          }
+        }
+      }
+    }
+#endif
+
     if (nodeID == 0) {
 #ifdef WITH_MPI
       message("engine_init: cpu map is [ ");
diff --git a/src/kernel.h b/src/kernel.h
index d55e073479d1585f3677389939fa8e7c53dbe1c9..aead6a95adc35028834d671448223a31a57fc2b6 100644
--- a/src/kernel.h
+++ b/src/kernel.h
@@ -516,16 +516,17 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float x,
 #define kernel_name "Wendland C2"
 #define kernel_degree 5
 #define kernel_ivals 1
-#define kernel_gamma 1.f
-#define kernel_gamma2 1.f
-#define kernel_gamma3 1.f
-#define kernel_igamma 1.f
+#define kernel_gamma 2.f
+#define kernel_gamma2 4.f
+#define kernel_gamma3 8.f
+#define kernel_igamma 0.5f
 #define kernel_nwneigh                                                      \
   (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \
    7.261825f)
 static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
-    __attribute__((aligned(16))) = {4.0f, -15.0f, 20.0f, -10.0f, 0.0f, 1.0f,
-                                    0.0f, 0.0f,   0.0f,  0.0f,   0.0f, 0.0f};
+    __attribute__((aligned(16))) = {
+        0.05222272f, -0.39167037f, 1.04445431f, -1.04445431f, 0.f,  0.41778173f,
+        0.0f,        0.0f,         0.0f,        0.0f,         0.0f, 0.0f};
 #define kernel_root (kernel_coeffs[kernel_degree])
 #define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree])
 
@@ -537,7 +538,7 @@ static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
 __attribute__((always_inline)) INLINE static void kernel_deval(float x,
                                                                float *W,
                                                                float *dW_dx) {
-  int ind = fminf(x, kernel_ivals);
+  int ind = fminf(0.5f * x, kernel_ivals);
   float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
   float w = coeffs[0] * x + coeffs[1];
   float dw_dx = coeffs[0];
@@ -563,7 +564,7 @@ __attribute__((always_inline))
   int j, k;
 
   /* Load x and get the interval id. */
-  ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals)));
+  ind.m = vec_ftoi(vec_fmin(0.5f * x->v, vec_set1((float)kernel_ivals)));
 
   /* load the coefficients. */
   for (k = 0; k < VEC_SIZE; k++)
@@ -590,7 +591,7 @@ __attribute__((always_inline))
 
 __attribute__((always_inline)) INLINE static void kernel_eval(float x,
                                                               float *W) {
-  int ind = fmin(x, kernel_ivals);
+  int ind = fmin(0.5f * x, kernel_ivals);
   float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
   float w = coeffs[0] * x + coeffs[1];
   for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
diff --git a/theory/kernel/kernels.py b/theory/kernel/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7bdbe2bf9ba49a30f4c8a2ae136c4843ce5c2cf
--- /dev/null
+++ b/theory/kernel/kernels.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python2                                  
+# -*- coding: utf-8 -*-            
+import matplotlib
+matplotlib.use("Agg")
+from pylab import *
+from scipy import integrate
+import distinct_colours as colours
+from scipy.optimize import curve_fit
+from scipy.optimize import fsolve
+from matplotlib.font_manager import FontProperties
+import numpy
+
+params = {
+    'axes.labelsize': 8,
+    'axes.titlesize': 8,
+    'font.size': 8,
+    'legend.fontsize': 9,
+    'xtick.labelsize': 8,
+    'ytick.labelsize': 8,
+    'xtick.major.pad': 2.5,
+    'ytick.major.pad': 2.5,
+    'text.usetex': True,
+'figure.figsize' : (3.15,3.15),
+'figure.subplot.left'    : 0.12,
+'figure.subplot.right'   : 0.99  ,
+'figure.subplot.bottom'  : 0.08  ,
+'figure.subplot.top'     : 0.99  ,
+'figure.subplot.wspace'  : 0.  ,
+'figure.subplot.hspace'  : 0.  ,
+'lines.markersize' : 6,
+'lines.linewidth' : 2.,
+'text.latex.unicode': True
+}
+rcParams.update(params)
+rc('font',**{'family':'sans-serif','sans-serif':['Times']})
+
+
+#Parameters
+eta = 1.2349
+h = 2.1
+
+#Constants
+PI = math.pi
+
+#Cubic Spline
+cubic_kernel_degree = 3
+cubic_kernel_ivals = 2
+cubic_kernel_gamma = 2.
+cubic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 6.0858
+cubic_kernel_coeffs = array([[3./(4.*PI) , -3./(2.*PI), 0.,     1./PI],
+                             [-1./(4.*PI),  3./(2.*PI), -3./PI, 2./PI],
+                             [0.,           0.,         0.,     0.]])
+def cubic_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(x, cubic_kernel_ivals)).astype(int)
+    coeffs = cubic_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, cubic_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w
+
+
+#Quartic Spline
+quartic_kernel_degree = 4
+quartic_kernel_ivals = 3
+quartic_kernel_gamma = 2.5
+quartic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 8.2293
+quartic_kernel_coeffs = array([[3./(10.*PI) , 0.,           -3./(4.*PI) , 0.          ,  23./(32.*PI)],
+                               [-1./(5.*PI) , 1./PI       , -3./(2.*PI) ,1./(4.*PI)   ,  11./(16.*PI)],
+                               [1./(20.*PI) , -1./(2.*PI) , 15./(8.*PI) , -25./(8.*PI), 125./(64.*PI)],
+                               [ 0. , 0.,           0.,         0.,     0.]])
+def quartic_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(x+0.5, quartic_kernel_ivals)).astype(int)
+    coeffs = quartic_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, quartic_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w
+
+
+# Wendland kernel
+wendland2_kernel_degree = 5
+wendland2_kernel_ivals = 1
+wendland2_kernel_gamma = 2
+wendland2_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 7.261825
+wendland2_kernel_coeffs = 3.342253804929802 * array([[1./8, -30./32, 80./32, -80./32., 0., 1.],
+                                                     [ 0. , 0.,  0.,   0., 0., 0.]]) / 8.
+
+print wendland2_kernel_coeffs
+def wendland2_W(x):
+    if size(x) == 1:
+        x = array([x])
+    ind = (minimum(0.5*x, wendland2_kernel_ivals)).astype(int)
+    coeffs = wendland2_kernel_coeffs[ind,:]
+    w = coeffs[:,0] * x + coeffs[:,1]
+    for k in range(2, wendland2_kernel_degree+1):
+        w = x * w + coeffs[:,k]
+    return w 
+
+#def wendland2_W(x):
+#    if size(x) == 1:
+#        x = array([x])
+#    x /= 1.936492
+#    x[x>1.] = 1.
+#    oneminusu = 1.-x
+#    oneminusu4 = oneminusu * oneminusu * oneminusu * oneminusu
+#    return 3.342253804929802 * oneminusu4 * (1. + 4.*x) / h**3
+
+
+#Find H
+r = arange(0, 3.5*h, 1./1000.)
+xi = r/h
+cubic_Ws = cubic_W(xi)
+quartic_Ws = quartic_W(xi)
+wendland2_Ws = wendland2_W(xi)
+for j in range(size(r)):
+    if cubic_Ws[j] == 0:
+        cubic_H = r[j]
+        break
+for j in range(size(r)):
+    if quartic_Ws[j] == 0:
+        quartic_H = r[j]
+        break
+for j in range(size(r)):
+    if wendland2_Ws[j] == 0:
+        wendland2_H = r[j]
+        break
+
+    
+print "H=", cubic_H
+print "H=", quartic_H
+print "H=", wendland2_H
+
+
+# Compute sigma -----------------------------------------
+cubic_norm = 4.*PI*integrate.quad(lambda x: x**2*cubic_W(x), 0, cubic_H)[0]
+quartic_norm = 4.*PI*integrate.quad(lambda x: x**2*quartic_W(x), 0, quartic_H)[0]
+wendland2_norm = 4.*PI*integrate.quad(lambda x: x**2*wendland2_W(x), 0, wendland2_H)[0]
+
+print cubic_norm
+print quartic_norm
+print wendland2_norm
+
+
+# Plot kernels ------------------------------------------
+r = arange(0, 3.5*h, 1./100.)
+xi = r/h
+
+cubic_Ws = cubic_W(xi)
+quartic_Ws = quartic_W(xi)
+wendland2_Ws = wendland2_W(xi)
+
+
+
+figure()
+
+text(h-0.1, cubic_Ws[0]/20., "h", ha="right",va="center")
+arrow(h, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='k', ec='k', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1)
+
+
+plot(r,cubic_Ws, 'b-' ,label="Cubic")
+plot(r, quartic_Ws, 'r-', label="Quartic")
+plot(r, wendland2_Ws, 'g-', label="Wendland C2")
+
+text(cubic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='b')
+arrow(cubic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='b', ec='b', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1)
+
+text(quartic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r')
+arrow(quartic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='r', ec='r', length_includes_head=True, head_length=quartic_Ws[0]/30., head_width=0.1)
+
+text(wendland2_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r')
+arrow(wendland2_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='g', ec='g', length_includes_head=True, head_length=wendland2_Ws[0]/30., head_width=0.1)
+
+
+xlabel("r", labelpad=0)
+ylabel("W(r,h)", labelpad=0)
+legend(loc="upper right")
+savefig("kernel.pdf")
+