diff --git a/configure.ac b/configure.ac index d54d3dc7efe01f4bcd67502c07e3a44a3b06a3a6..de0e1b35d4da78d103ddf27304ad527361a342c7 100644 --- a/configure.ac +++ b/configure.ac @@ -270,6 +270,9 @@ AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[true], AM_CONDITIONAL(HAVESETAFFINITY, [test "$ac_cv_func_pthread_setaffinity_np" = "yes"]) +# Check for libnuma. +AC_CHECK_LIB([numa], [numa_available]) + # Check for timing functions needed by cycle.h. AC_HEADER_TIME AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) diff --git a/examples/main.c b/examples/main.c index 338bbe27e682f57540232af3a111fcc1ee716ef6..d15bbfb0d0c28cb80540e86d36a185aabb1ece38 100644 --- a/examples/main.c +++ b/examples/main.c @@ -116,6 +116,21 @@ int main(int argc, char *argv[]) { /* Greeting message */ if (myrank == 0) greetings(); +#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE) + /* Ensure the NUMA node on which we initialise (first touch) everything + * doesn't change before engine_init allocates NUMA-local workers. Otherwise, + * we may be scheduled elsewhere between the two times. + */ + cpu_set_t affinity; + CPU_ZERO(&affinity); + CPU_SET(sched_getcpu(), &affinity); + if (sched_setaffinity(0, sizeof(cpu_set_t), &affinity) != 0) { + message("failed to set entry thread's affinity"); + } else { + message("set entry thread's affinity"); + } +#endif + /* Init the space. */ bzero(&s, sizeof(struct space)); diff --git a/src/engine.c b/src/engine.c index cd4d6944ef43f1ac6d580384cb3a616e7644a03f..054e9c8a64585c602c08ef2d4aae2ecccf21aa21 100644 --- a/src/engine.c +++ b/src/engine.c @@ -28,6 +28,7 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <stdbool.h> /* MPI headers. */ #ifdef WITH_MPI @@ -38,6 +39,10 @@ #endif #endif +#ifdef HAVE_LIBNUMA +#include <numa.h> +#endif + /* This object's header. */ #include "engine.h" @@ -2117,6 +2122,22 @@ void engine_split(struct engine *e, int *grid) { s->xparts = xparts_new; } +#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE) +static bool hyperthreads_present(void) { +#ifdef __linux__ + FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r"); + + int c; + while ((c = fgetc(f)) != EOF && c != ','); + fclose(f); + + return c == ','; +#else + return true; // just guess +#endif +} +#endif + /** * @brief init an engine with the given number of threads, queues, and * the given policy. @@ -2153,6 +2174,42 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, for (j = maxint / i / 2; j < maxint; j += maxint / i) if (j < nr_cores && j != 0) cpuid[k++] = j; +#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE) + /* Ascending NUMA distance. Bubblesort(!) for stable equidistant CPUs. */ + if (numa_available() >= 0) { + if (nodeID == 0) message("prefer NUMA-local CPUs"); + + int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2; + bool done = false, swap_hyperthreads = hyperthreads_present(); + if (swap_hyperthreads) message("prefer physical cores to hyperthreads"); + + while (!done) { + done = true; + for (i = 1; i < nr_cores; i++) { + int node_a = numa_node_of_cpu(cpuid[i-1]); + int node_b = numa_node_of_cpu(cpuid[i]); + + /* Avoid using local hyperthreads over unused remote physical cores. + * Assume two hyperthreads, and that cpuid >= half partitions them. + */ + int thread_a = swap_hyperthreads && cpuid[i-1] >= half; + int thread_b = swap_hyperthreads && cpuid[i] >= half; + + bool swap = thread_a > thread_b; + if (thread_a == thread_b) + swap = numa_distance(home, node_a) > numa_distance(home, node_b); + + if (swap) { + int t = cpuid[i-1]; + cpuid[i-1] = cpuid[i]; + cpuid[i] = t; + done = false; + } + } + } + } +#endif + if (nodeID == 0) { #ifdef WITH_MPI message("engine_init: cpu map is [ "); diff --git a/src/kernel.h b/src/kernel.h index d55e073479d1585f3677389939fa8e7c53dbe1c9..aead6a95adc35028834d671448223a31a57fc2b6 100644 --- a/src/kernel.h +++ b/src/kernel.h @@ -516,16 +516,17 @@ __attribute__((always_inline)) INLINE static void kernel_eval(float x, #define kernel_name "Wendland C2" #define kernel_degree 5 #define kernel_ivals 1 -#define kernel_gamma 1.f -#define kernel_gamma2 1.f -#define kernel_gamma3 1.f -#define kernel_igamma 1.f +#define kernel_gamma 2.f +#define kernel_gamma2 4.f +#define kernel_gamma3 8.f +#define kernel_igamma 0.5f #define kernel_nwneigh \ (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \ 7.261825f) static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] - __attribute__((aligned(16))) = {4.0f, -15.0f, 20.0f, -10.0f, 0.0f, 1.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + __attribute__((aligned(16))) = { + 0.05222272f, -0.39167037f, 1.04445431f, -1.04445431f, 0.f, 0.41778173f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; #define kernel_root (kernel_coeffs[kernel_degree]) #define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree]) @@ -537,7 +538,7 @@ static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] __attribute__((always_inline)) INLINE static void kernel_deval(float x, float *W, float *dW_dx) { - int ind = fminf(x, kernel_ivals); + int ind = fminf(0.5f * x, kernel_ivals); float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; float w = coeffs[0] * x + coeffs[1]; float dw_dx = coeffs[0]; @@ -563,7 +564,7 @@ __attribute__((always_inline)) int j, k; /* Load x and get the interval id. */ - ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals))); + ind.m = vec_ftoi(vec_fmin(0.5f * x->v, vec_set1((float)kernel_ivals))); /* load the coefficients. */ for (k = 0; k < VEC_SIZE; k++) @@ -590,7 +591,7 @@ __attribute__((always_inline)) __attribute__((always_inline)) INLINE static void kernel_eval(float x, float *W) { - int ind = fmin(x, kernel_ivals); + int ind = fmin(0.5f * x, kernel_ivals); float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; float w = coeffs[0] * x + coeffs[1]; for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k]; diff --git a/theory/kernel/kernels.py b/theory/kernel/kernels.py new file mode 100644 index 0000000000000000000000000000000000000000..d7bdbe2bf9ba49a30f4c8a2ae136c4843ce5c2cf --- /dev/null +++ b/theory/kernel/kernels.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +import matplotlib +matplotlib.use("Agg") +from pylab import * +from scipy import integrate +import distinct_colours as colours +from scipy.optimize import curve_fit +from scipy.optimize import fsolve +from matplotlib.font_manager import FontProperties +import numpy + +params = { + 'axes.labelsize': 8, + 'axes.titlesize': 8, + 'font.size': 8, + 'legend.fontsize': 9, + 'xtick.labelsize': 8, + 'ytick.labelsize': 8, + 'xtick.major.pad': 2.5, + 'ytick.major.pad': 2.5, + 'text.usetex': True, +'figure.figsize' : (3.15,3.15), +'figure.subplot.left' : 0.12, +'figure.subplot.right' : 0.99 , +'figure.subplot.bottom' : 0.08 , +'figure.subplot.top' : 0.99 , +'figure.subplot.wspace' : 0. , +'figure.subplot.hspace' : 0. , +'lines.markersize' : 6, +'lines.linewidth' : 2., +'text.latex.unicode': True +} +rcParams.update(params) +rc('font',**{'family':'sans-serif','sans-serif':['Times']}) + + +#Parameters +eta = 1.2349 +h = 2.1 + +#Constants +PI = math.pi + +#Cubic Spline +cubic_kernel_degree = 3 +cubic_kernel_ivals = 2 +cubic_kernel_gamma = 2. +cubic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 6.0858 +cubic_kernel_coeffs = array([[3./(4.*PI) , -3./(2.*PI), 0., 1./PI], + [-1./(4.*PI), 3./(2.*PI), -3./PI, 2./PI], + [0., 0., 0., 0.]]) +def cubic_W(x): + if size(x) == 1: + x = array([x]) + ind = (minimum(x, cubic_kernel_ivals)).astype(int) + coeffs = cubic_kernel_coeffs[ind,:] + w = coeffs[:,0] * x + coeffs[:,1] + for k in range(2, cubic_kernel_degree+1): + w = x * w + coeffs[:,k] + return w + + +#Quartic Spline +quartic_kernel_degree = 4 +quartic_kernel_ivals = 3 +quartic_kernel_gamma = 2.5 +quartic_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 8.2293 +quartic_kernel_coeffs = array([[3./(10.*PI) , 0., -3./(4.*PI) , 0. , 23./(32.*PI)], + [-1./(5.*PI) , 1./PI , -3./(2.*PI) ,1./(4.*PI) , 11./(16.*PI)], + [1./(20.*PI) , -1./(2.*PI) , 15./(8.*PI) , -25./(8.*PI), 125./(64.*PI)], + [ 0. , 0., 0., 0., 0.]]) +def quartic_W(x): + if size(x) == 1: + x = array([x]) + ind = (minimum(x+0.5, quartic_kernel_ivals)).astype(int) + coeffs = quartic_kernel_coeffs[ind,:] + w = coeffs[:,0] * x + coeffs[:,1] + for k in range(2, quartic_kernel_degree+1): + w = x * w + coeffs[:,k] + return w + + +# Wendland kernel +wendland2_kernel_degree = 5 +wendland2_kernel_ivals = 1 +wendland2_kernel_gamma = 2 +wendland2_kernel_ngb = 4.0 / 3.0 * PI * eta**3 * 7.261825 +wendland2_kernel_coeffs = 3.342253804929802 * array([[1./8, -30./32, 80./32, -80./32., 0., 1.], + [ 0. , 0., 0., 0., 0., 0.]]) / 8. + +print wendland2_kernel_coeffs +def wendland2_W(x): + if size(x) == 1: + x = array([x]) + ind = (minimum(0.5*x, wendland2_kernel_ivals)).astype(int) + coeffs = wendland2_kernel_coeffs[ind,:] + w = coeffs[:,0] * x + coeffs[:,1] + for k in range(2, wendland2_kernel_degree+1): + w = x * w + coeffs[:,k] + return w + +#def wendland2_W(x): +# if size(x) == 1: +# x = array([x]) +# x /= 1.936492 +# x[x>1.] = 1. +# oneminusu = 1.-x +# oneminusu4 = oneminusu * oneminusu * oneminusu * oneminusu +# return 3.342253804929802 * oneminusu4 * (1. + 4.*x) / h**3 + + +#Find H +r = arange(0, 3.5*h, 1./1000.) +xi = r/h +cubic_Ws = cubic_W(xi) +quartic_Ws = quartic_W(xi) +wendland2_Ws = wendland2_W(xi) +for j in range(size(r)): + if cubic_Ws[j] == 0: + cubic_H = r[j] + break +for j in range(size(r)): + if quartic_Ws[j] == 0: + quartic_H = r[j] + break +for j in range(size(r)): + if wendland2_Ws[j] == 0: + wendland2_H = r[j] + break + + +print "H=", cubic_H +print "H=", quartic_H +print "H=", wendland2_H + + +# Compute sigma ----------------------------------------- +cubic_norm = 4.*PI*integrate.quad(lambda x: x**2*cubic_W(x), 0, cubic_H)[0] +quartic_norm = 4.*PI*integrate.quad(lambda x: x**2*quartic_W(x), 0, quartic_H)[0] +wendland2_norm = 4.*PI*integrate.quad(lambda x: x**2*wendland2_W(x), 0, wendland2_H)[0] + +print cubic_norm +print quartic_norm +print wendland2_norm + + +# Plot kernels ------------------------------------------ +r = arange(0, 3.5*h, 1./100.) +xi = r/h + +cubic_Ws = cubic_W(xi) +quartic_Ws = quartic_W(xi) +wendland2_Ws = wendland2_W(xi) + + + +figure() + +text(h-0.1, cubic_Ws[0]/20., "h", ha="right",va="center") +arrow(h, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='k', ec='k', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1) + + +plot(r,cubic_Ws, 'b-' ,label="Cubic") +plot(r, quartic_Ws, 'r-', label="Quartic") +plot(r, wendland2_Ws, 'g-', label="Wendland C2") + +text(cubic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='b') +arrow(cubic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='b', ec='b', length_includes_head=True, head_length=cubic_Ws[0]/30., head_width=0.1) + +text(quartic_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r') +arrow(quartic_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='r', ec='r', length_includes_head=True, head_length=quartic_Ws[0]/30., head_width=0.1) + +text(wendland2_H-0.1, cubic_Ws[0]/20., "H", ha="right",va="center", color='r') +arrow(wendland2_H, cubic_Ws[0]/10., 0., -cubic_Ws[0]/10., fc='g', ec='g', length_includes_head=True, head_length=wendland2_Ws[0]/30., head_width=0.1) + + +xlabel("r", labelpad=0) +ylabel("W(r,h)", labelpad=0) +legend(loc="upper right") +savefig("kernel.pdf") +