diff --git a/doc/RTD/source/ParameterFiles/parameter_description.rst b/doc/RTD/source/ParameterFiles/parameter_description.rst
index 28f8b908de911f5054fb0b46a43977d0cb1eb739..87961c443f232b07b74164f103cd99000f9d7150 100644
--- a/doc/RTD/source/ParameterFiles/parameter_description.rst
+++ b/doc/RTD/source/ParameterFiles/parameter_description.rst
@@ -293,6 +293,9 @@ Particle-Mesh part of the calculation. The last five are optional:
 
 * The number cells along each axis of the mesh :math:`N`: ``mesh_side_length``,
 * Whether or not to use a distributed mesh when running over MPI: ``distributed_mesh`` (default: ``0``),
+* Whether or not to use local patches instead of writing directly to the mesh
+  with atomic operations in the non-MPI case (a performance-tuning
+  parameter): ``mesh_uses_local_patches`` (default: ``1``),
 * The mesh smoothing scale in units of the mesh cell-size :math:`a_{\rm
   smooth}`: ``a_smooth`` (default: ``1.25``),
 * The scale above which the short-range forces are assumed to be 0 (in units of
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 7fc39617f4aa85a1bfa06dbbdff47fc61f872429..ff472d4eba03c90134ef285c88283c7c827f43da 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -79,6 +79,7 @@ Stars:
 Gravity:
   mesh_side_length:              128       # Number of cells along each axis for the periodic gravity mesh (must be even).
   distributed_mesh:              0         # (Optional) Are we using a distributed mesh when running over MPI (necessary for meshes > 1290^3)
+  mesh_uses_local_patches:       1         # (Optional) Are we using thread-local patches (1) or direct atomic writes to the global mesh (0) in the non-MPI case?
   eta:                           0.025     # Constant dimensionless multiplier for time integration.
   MAC:                           adaptive  # Choice of mulitpole acceptance criterion: 'adaptive' OR 'geometric'.
   epsilon_fmm:                   0.001     # Tolerance parameter for the adaptive multipole acceptance criterion.
diff --git a/src/gravity_properties.c b/src/gravity_properties.c
index 575d21b14b1cb822ccf17143daf491f4d2e3103c..6cffb1f395730369470266267354d6edc818735c 100644
--- a/src/gravity_properties.c
+++ b/src/gravity_properties.c
@@ -64,6 +64,8 @@ void gravity_props_init(struct gravity_props *p, struct swift_params *params,
     p->distributed_mesh =
         parser_get_opt_param_int(params, "Gravity:distributed_mesh",
                                  gravity_props_default_distributed_mesh);
+    p->mesh_uses_local_patches =
+        parser_get_opt_param_int(params, "Gravity:mesh_uses_local_patches", 1);
     p->a_smooth = parser_get_opt_param_float(params, "Gravity:a_smooth",
                                              gravity_props_default_a_smooth);
     p->r_cut_max_ratio = parser_get_opt_param_float(
diff --git a/src/gravity_properties.h b/src/gravity_properties.h
index cc2ccc8d835f76c587e0ff04da06102633067c12..016c318f46e34b5e68ae7d3aa1ebbb1aab928b46 100644
--- a/src/gravity_properties.h
+++ b/src/gravity_properties.h
@@ -121,6 +121,10 @@ struct gravity_props {
   /*! Whether mesh is distributed between MPI ranks when we use MPI  */
   int distributed_mesh;
 
+  /*! Whether or not to use local patches rather than
+   * direct atomic writes to the mesh when running without MPI */
+  int mesh_uses_local_patches;
+
   /*! Mesh smoothing scale in units of top-level cell size */
   float a_smooth;
 
diff --git a/src/mesh_gravity.c b/src/mesh_gravity.c
index 7af0224519f0bc1cc1970e9fc7f489c6edb7fac2..5560b9c12987e2f3075f93cf958221f854bcb0e2 100644
--- a/src/mesh_gravity.c
+++ b/src/mesh_gravity.c
@@ -215,6 +215,7 @@ struct cic_mapper_data {
   double* rho;
   double* potential;
   int N;
+  int use_local_patches;
   double fac;
   double dim[3];
   float const_G;
@@ -260,10 +261,8 @@ void cell_gpart_to_mesh_CIC_mapper(void* map_data, int num, void* extra) {
   /* Pointer to the chunk to be processed */
   int* local_cells = (int*)map_data;
 
-  // MATTHIEU: This could in principle be improved by creating a local mesh
-  //           with just the extent required for the cell. Assignment can
-  //           then be done without atomics. That local mesh is then added
-  //           atomically to the global one.
+  /* A temporary patch of the global mesh (used only with local patches) */
+  struct pm_mesh_patch patch;
 
   /* Loop over the elements assigned to this thread */
   for (int i = 0; i < num; ++i) {
@@ -271,8 +270,26 @@ void cell_gpart_to_mesh_CIC_mapper(void* map_data, int num, void* extra) {
     /* Pointer to local cell */
     const struct cell* c = &cells[local_cells[i]];
 
-    /* Assign this cell's content to the mesh */
-    cell_gpart_to_mesh_CIC(c, rho, N, fac, dim, nu_model);
+    /* Skip empty cells */
+    if (c->grav.count == 0) continue;
+
+    if (data->use_local_patches) {
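+      /* The patch covers only this cell's extent, so the CIC assignment
+         itself needs no atomic operations; only the final addition of the
+         patch to the global mesh is done atomically. */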
+
+      /* Do a CIC interpolation of all the particles in this cell onto
+         the local patch (allocates memory in the patch) */
+      accumulate_cell_to_local_patch(N, fac, dim, c, &patch, nu_model);
+
+      /* Atomically add the local patch values to the global mesh */
+      pm_add_patch_to_global_mesh(rho, &patch);
+
+      /* Free the allocated memory */
+      pm_mesh_patch_clean(&patch);
+
+    } else {
+
+      /* Assign this cell's content to the global mesh using atomic writes */
+      cell_gpart_to_mesh_CIC(c, rho, N, fac, dim, nu_model);
+    }
   }
 }
 
@@ -880,6 +897,7 @@ void compute_potential_global(struct pm_mesh* mesh, const struct space* s,
   data.rho = rho;
   data.potential = NULL;
   data.N = N;
+  data.use_local_patches = mesh->use_local_patches;
   data.fac = cell_fac;
   data.dim[0] = dim[0];
   data.dim[1] = dim[1];
@@ -1136,6 +1154,7 @@ void pm_mesh_init(struct pm_mesh* mesh, const struct gravity_props* props,
   mesh->periodic = 1;
   mesh->N = N;
   mesh->distributed_mesh = props->distributed_mesh;
+  mesh->use_local_patches = props->mesh_uses_local_patches;
   mesh->dim[0] = dim[0];
   mesh->dim[1] = dim[1];
   mesh->dim[2] = dim[2];
diff --git a/src/mesh_gravity.h b/src/mesh_gravity.h
index 4f968d09dd8c53bab5823f01cddfcc30f4666325..5b92d6e1c0bf10455a182253788cdd3aae638ba5 100644
--- a/src/mesh_gravity.h
+++ b/src/mesh_gravity.h
@@ -50,6 +50,10 @@ struct pm_mesh {
   /*! Whether mesh is distributed between MPI ranks */
   int distributed_mesh;
 
+  /*! Whether or not to use local patches rather than
+   * direct atomic writes to the mesh when running without MPI */
+  int use_local_patches;
+
   /*! Integer time-step end of the mesh force for the last step */
   integertime_t ti_end_mesh_last;
 
diff --git a/src/mesh_gravity_mpi.c b/src/mesh_gravity_mpi.c
index f493eae2ac081825accbe19d122e1e31c3e34664..2401b5ac4dab2030864a36305c3f9b21205012f8 100644
--- a/src/mesh_gravity_mpi.c
+++ b/src/mesh_gravity_mpi.c
@@ -161,6 +161,9 @@ void accumulate_cell_to_local_patches_mapper(void *map_data, int num,
     /* Pointer to local cell */
     const struct cell *c = &cells[local_cells[i]];
 
+    /* Skip empty cells */
+    if (c->grav.count == 0) continue;
+
     /* Assign this cell's content to the mesh */
     accumulate_cell_to_local_patch(N, fac, dim, c, &local_patches[i], nu_model);
   }
diff --git a/src/mesh_gravity_mpi.h b/src/mesh_gravity_mpi.h
index 4840a6d379f71ee91a12c8738e04401265e16806..11e2f9a3a9ad9c19d6defec939a7b0aee9a39f88 100644
--- a/src/mesh_gravity_mpi.h
+++ b/src/mesh_gravity_mpi.h
@@ -28,6 +28,12 @@ struct cell;
 struct threadpool;
 struct pm_mesh;
 struct pm_mesh_patch;
+struct neutrino_model;
+
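+/* Also used by the non-MPI code path in mesh_gravity.c */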
+void accumulate_cell_to_local_patch(const int N, const double fac,
+                                    const double *dim, const struct cell *cell,
+                                    struct pm_mesh_patch *patch,
+                                    const struct neutrino_model *nu_model);
 
 void mpi_mesh_accumulate_gparts_to_local_patches(
     struct threadpool *tp, const int N, const double fac, const struct space *s,
diff --git a/src/mesh_gravity_patch.c b/src/mesh_gravity_patch.c
index c26153f7f4f06e30f9354d235ab5a8085f78d1b1..53036581ca35d4a631904bb6e25f0c4161ee6936 100644
--- a/src/mesh_gravity_patch.c
+++ b/src/mesh_gravity_patch.c
@@ -26,6 +26,7 @@
 #include "mesh_gravity_patch.h"
 
 /* Local includes. */
+#include "atomic.h"
 #include "cell.h"
 #include "error.h"
 #include "row_major_id.h"
@@ -99,6 +100,45 @@ void pm_mesh_patch_init(struct pm_mesh_patch *patch, const struct cell *cell,
     error("Failed to allocate array for mesh patch!");
 }
 
+/**
+ * @brief Write the content of a mesh patch back to the global mesh
+ * using atomic operations.
+ *
+ * @param global_mesh The global mesh to write to.
+ * @param patch The #pm_mesh_patch object to write from.
+ */
+void pm_add_patch_to_global_mesh(double *const global_mesh,
+                                 const struct pm_mesh_patch *patch) {
+
+  const int N = patch->N;
+  const int size_i = patch->mesh_size[0];
+  const int size_j = patch->mesh_size[1];
+  const int size_k = patch->mesh_size[2];
+  const int mesh_min_i = patch->mesh_min[0];
+  const int mesh_min_j = patch->mesh_min[1];
+  const int mesh_min_k = patch->mesh_min[2];
+
+  /* Remind the compiler that the array is nicely aligned */
+  swift_declare_aligned_ptr(const double, mesh, patch->mesh,
+                            SWIFT_CACHE_ALIGNMENT);
+
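+  /* Walk the patch and atomically accumulate each value into the
+     corresponding (periodically wrapped) cell of the global mesh */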
+  for (int i = 0; i < size_i; ++i) {
+    for (int j = 0; j < size_j; ++j) {
+      for (int k = 0; k < size_k; ++k) {
+
+        const int ii = i + mesh_min_i;
+        const int jj = j + mesh_min_j;
+        const int kk = k + mesh_min_k;
+
+        const int patch_index = pm_mesh_patch_index(patch, i, j, k);
+        const int mesh_index = row_major_id_periodic(ii, jj, kk, N);
+
+        atomic_add_d(&global_mesh[mesh_index], mesh[patch_index]);
+      }
+    }
+  }
+}
+
 /**
  * @brief Set all values in a mesh patch to zero
  *
@@ -106,9 +146,12 @@ void pm_mesh_patch_init(struct pm_mesh_patch *patch, const struct cell *cell,
  */
 void pm_mesh_patch_zero(struct pm_mesh_patch *patch) {
 
+  /* Remind the compiler that the array is nicely aligned */
+  swift_declare_aligned_ptr(double, mesh, patch->mesh, SWIFT_CACHE_ALIGNMENT);
+
   const int num =
       patch->mesh_size[0] * patch->mesh_size[1] * patch->mesh_size[2];
-  memset(patch->mesh, 0, num * sizeof(double));
+  memset(mesh, 0, num * sizeof(double));
 }
 
 /**
@@ -118,7 +161,7 @@ void pm_mesh_patch_zero(struct pm_mesh_patch *patch) {
  */
 void pm_mesh_patch_clean(struct pm_mesh_patch *patch) {
 
-  swift_free("mesh_patch", patch->mesh);
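+  /* Nothing to free if the patch was never allocated */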
+  if (patch->mesh) swift_free("mesh_patch", patch->mesh);
 
   /* Zero everything and give a silly mesh size to help debugging */
   memset(patch, 0, sizeof(struct pm_mesh_patch));
diff --git a/src/mesh_gravity_patch.h b/src/mesh_gravity_patch.h
index 067dc2cf5e0fbd289534c5fe1afeb65a5f413e00..cc82175882159e3a29fc523a008da44c27fc2d3c 100644
--- a/src/mesh_gravity_patch.h
+++ b/src/mesh_gravity_patch.h
@@ -167,4 +167,7 @@ __attribute__((always_inline)) INLINE static void pm_mesh_patch_CIC_set(
   mesh[pm_mesh_patch_index(patch, i + 1, j + 1, k + 1)] += value * dx * dy * dz;
 }
 
+void pm_add_patch_to_global_mesh(double *const global_mesh,
+                                 const struct pm_mesh_patch *patch);
+
 #endif