diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h
index d974c3589964f7ddd5846216aaa9b8c5784601ae..585655940cf6f0587fd7dd1efae1bf546476b1da 100644
--- a/src/kernel_hydro.h
+++ b/src/kernel_hydro.h
@@ -304,6 +304,40 @@ __attribute__((always_inline)) INLINE static void kernel_eval(
   *W = w * kernel_constant * kernel_gamma_inv_dim;
 }
 
+/**
+ * @brief Computes the kernel function derivative.
+ *
+ * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
+ * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
+ *
+ * Returns 0 if \f$u > \gamma = H/h\f$.
+ *
+ * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
+ * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
+    float u, float *restrict dW_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  const float x = u * kernel_gamma_inv;
+
+  /* Pick the correct branch of the kernel */
+  const int temp = (int)(x * kernel_ivals_f);
+  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
+  const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
+
+  /* First two terms of the polynomial ... */
+  float dw_dx = ((float)kernel_degree * coeffs[0] * x) + (float)(kernel_degree - 1) * coeffs[1];
+
+  /* ... and the rest of them */
+  for (int k = 2; k < kernel_degree; k++) {
+    dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
+  }
+
+  /* Return everything */
+  *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
+}
+
 /* ------------------------------------------------------------------------- */
 
 #ifdef WITH_VECTORIZATION
@@ -370,24 +404,35 @@ static const vector wendland_const_c2 = FILL_VEC(20.f);
 static const vector wendland_const_c3 = FILL_VEC(-10.f);
 static const vector wendland_const_c4 = FILL_VEC(0.f);
 static const vector wendland_const_c5 = FILL_VEC(1.f);
+
+static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f);
+static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f);
+static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f);
+static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f);
 #elif defined(CUBIC_SPLINE_KERNEL)
 /* First region 0 < u < 0.5 */
 static const vector cubic_1_const_c0 = FILL_VEC(3.f);
 static const vector cubic_1_const_c1 = FILL_VEC(-3.f);
 static const vector cubic_1_const_c2 = FILL_VEC(0.f);
 static const vector cubic_1_const_c3 = FILL_VEC(0.5f);
+static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f);
+static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f);
+static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f);
 
 /* Second region 0.5 <= u < 1 */
 static const vector cubic_2_const_c0 = FILL_VEC(-1.f);
 static const vector cubic_2_const_c1 = FILL_VEC(3.f);
 static const vector cubic_2_const_c2 = FILL_VEC(-3.f);
 static const vector cubic_2_const_c3 = FILL_VEC(1.f);
+static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f);
+static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f);
+static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
 static const vector cond = FILL_VEC(0.5f);
 #endif
 
 /**
  * @brief Computes the kernel function and its derivative for two particles
- * using interleaved vectors.
+ * using vectors.
  *
  * Return 0 if $u > \\gamma = H/h$
  *
@@ -415,7 +460,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
-  w->v = vec_fma(x.v, w->v, wendland_const_c4.v);
+  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
@@ -438,7 +483,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
   /* Calculate the polynomial interleaving vector operations. */
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
-  w->v = vec_fma(x.v, w->v, cubic_1_const_c2.v);
+  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
   w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
   
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
@@ -511,8 +556,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
-  w->v = vec_fma(x.v, w->v, wendland_const_c4.v);
-  w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v);
+  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
+  w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
 
   dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
@@ -556,8 +601,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
   dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
   dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
   dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
-  w->v = vec_fma(x.v, w->v, cubic_1_const_c2.v);
-  w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c2.v);
+  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
+  w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
   w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
   w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
 
@@ -597,8 +642,129 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
 #endif
 }
 
+/**
+ * @brief Computes the kernel function for two particles
+ * using vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param w (return) The value of the kernel function $W(x,h)$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
+    vector *u, vector *w) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
+  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
+  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/
+  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector w2;
+  vector mask1, mask2;
+
+  /* Form a mask for each part of the kernel. */
+  mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
+
+  /* Work out w for both regions of the kernel and combine the results together using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
+  w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
+  
+  /* Calculate the polynomial interleaving vector operations. */
+  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
+  
+  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
+  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
+  
+  /* Mask out unneeded values. */
+  w->v = vec_and(w->v,mask1.v);
+  w2.v = vec_and(w2.v,mask2.v);
+
+  /* Added both w and w2 together to form complete result. */
+  w->v = vec_add(w->v,w2.v);
+#else
+#error
 #endif
 
+  /* Return everything */
+  w->v =
+      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
+}
+
+/**
+ * @brief Computes the kernel function derivative for two particles
+ * using vectors.
+ *
+ * Return 0 if $u > \\gamma = H/h$
+ *
+ * @param u The ratio of the distance to the smoothing length $u = x/h$.
+ * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
+ */
+__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
+    vector *u, vector *dw_dx) {
+
+  /* Go to the range [0,1[ from [0,H[ */
+  vector x;
+  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
+
+#ifdef WENDLAND_C2_KERNEL
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v,x.v,wendland_dwdx_const_c1.v);
+
+  /* Calculate the polynomial interleaving vector operations */
+  dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c2.v);
+
+  dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c3.v);
+
+  dw_dx->v = vec_mul(dw_dx->v,x.v);
+
+#elif defined(CUBIC_SPLINE_KERNEL)
+  vector dw_dx2;
+  vector mask1, mask2;
+
+  /* Form a mask for each part of the kernel. */
+  mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
+  mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
+
+  /* Work out w for both regions of the kernel and combine the results together using masks. */
+
+  /* Init the iteration for Horner's scheme. */
+  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v,x.v,cubic_1_dwdx_const_c1.v);
+  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v,x.v,cubic_2_dwdx_const_c1.v);
+  
+  /* Calculate the polynomial interleaving vector operations. */
+  dw_dx->v = vec_mul(dw_dx->v,x.v); /* cubic_1_dwdx_const_c2 is zero. */
+  dw_dx2.v = vec_fma(dw_dx2.v,x.v,cubic_2_dwdx_const_c2.v);
+  
+  /* Mask out unneeded values. */
+  dw_dx->v = vec_and(dw_dx->v,mask1.v);
+  dw_dx2.v = vec_and(dw_dx2.v,mask2.v);
+
+  /* Added both dwdx and dwdx2 together to form complete result. */
+  dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
+#else
+#error
+#endif
+
+  /* Return everything */
+  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
+                                       kernel_gamma_inv_dim_plus_one_vec.v));
+}
+
+#endif /* WITH_VECTORIZATION */
+
 /* Some cross-check functions */
 void hydro_kernel_dump(int N);