diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 754b0d998607b1c1ebf747bcac4406f65d7e9c16..763098d75700f4767a17fa8e39f2dbbe2a6c8de2 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -129,6 +129,7 @@ Snapshots:
   delta_time: 0.01        # Time difference between consecutive outputs (in internal units)
   invoke_stf: 0           # (Optional) Call VELOCIraptor every time a snapshot is written irrespective of the VELOCIraptor output strategy.
   compression: 0          # (Optional) Set the level of compression of the HDF5 datasets [0-9]. 0 does no compression.
+  distributed: 0          # (Optional) When running over MPI, should each rank write a partial snapshot or do we want a single file? 1 implies one file per MPI rank.
   int_time_label_on:   0  # (Optional) Enable to label the snapshots using the time rounded to an integer (in internal units)
   UnitMass_in_cgs:     1  # (Optional) Unit system for the outputs (Grams)
   UnitLength_in_cgs:   1  # (Optional) Unit system for the outputs (Centimeters)
diff --git a/src/Makefile.am b/src/Makefile.am
index 04f9e7c58ed10435f7ba3f636ca841fdd81c40ab..7f958ff47c446b85a8d191f70c573a80aa7af5aa 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -43,8 +43,8 @@ endif
 # List required headers
 include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
     engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \
-    common_io.h single_io.h multipole.h map.h tools.h partition.h partition_fixed_costs.h \
-    clocks.h parser.h physical_constants.h physical_constants_cgs.h potential.h version.h \
+    common_io.h single_io.h distributed_io.h multipole.h map.h tools.h  partition_fixed_costs.h \
+    partition.h clocks.h parser.h physical_constants.h physical_constants_cgs.h potential.h version.h \
     hydro_properties.h riemann.h threadpool.h cooling_io.h cooling.h cooling_struct.h \
     statistics.h memswap.h cache.h runner_doiact_hydro_vec.h profiler.h entropy_floor.h \
     dump.h logger.h active.h timeline.h xmf.h gravity_properties.h gravity_derivatives.h \
@@ -80,7 +80,7 @@ AM_SOURCES = space.c runner_main.c runner_doiact_hydro.c runner_doiact_limiter.c
     engine_marktasks.c engine_drift.c engine_unskip.c engine_collect_end_of_step.c \
     engine_redistribute.c engine_fof.c serial_io.c timers.c debug.c scheduler.c \
     proxy.c parallel_io.c units.c common_io.c single_io.c multipole.c version.c map.c \
-    kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \
+    kernel_hydro.c tools.c part.c partition.c clocks.c parser.c distributed_io.c \
     physical_constants.c potential.c hydro_properties.c \
     threadpool.c cooling.c star_formation.c \
     statistics.c profiler.c dump.c logger.c \
diff --git a/src/engine.c b/src/engine.c
index 43f06735024ee732d6dd45bd24c70e40317bd74a..bd72e2a5a04639dca7e84b0caa64a0803c69564c 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -61,6 +61,7 @@
 #include "cosmology.h"
 #include "cycle.h"
 #include "debug.h"
+#include "distributed_io.h"
 #include "entropy_floor.h"
 #include "equation_of_state.h"
 #include "error.h"
@@ -3247,16 +3248,26 @@ void engine_dump_snapshot(struct engine *e) {
     }
   }
 
-/* Dump... */
+/* Dump (depending on the chosen strategy) ... */
 #if defined(HAVE_HDF5)
 #if defined(WITH_MPI)
+
+  if (e->snapshot_distributed) {
+
+    write_output_distributed(e, snapshotBase, e->internal_units,
+                             e->snapshot_units, e->nodeID, e->nr_nodes,
+                             MPI_COMM_WORLD, MPI_INFO_NULL);
+  } else {
+
 #if defined(HAVE_PARALLEL_HDF5)
-  write_output_parallel(e, snapshotBase, e->internal_units, e->snapshot_units,
-                        e->nodeID, e->nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
+    write_output_parallel(e, snapshotBase, e->internal_units, e->snapshot_units,
+                          e->nodeID, e->nr_nodes, MPI_COMM_WORLD,
+                          MPI_INFO_NULL);
 #else
-  write_output_serial(e, snapshotBase, e->internal_units, e->snapshot_units,
-                      e->nodeID, e->nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
+    write_output_serial(e, snapshotBase, e->internal_units, e->snapshot_units,
+                        e->nodeID, e->nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL);
 #endif
+  }
 #else
   write_output_single(e, snapshotBase, e->internal_units, e->snapshot_units);
 #endif
@@ -3449,6 +3460,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
                               engine_default_snapshot_subdir);
   e->snapshot_compression =
       parser_get_opt_param_int(params, "Snapshots:compression", 0);
+  e->snapshot_distributed =
+      parser_get_opt_param_int(params, "Snapshots:distributed", 0);
   e->snapshot_int_time_label_on =
       parser_get_opt_param_int(params, "Snapshots:int_time_label_on", 0);
   e->snapshot_invoke_stf =
diff --git a/src/engine.h b/src/engine.h
index acd839b47f3d0fa6344b0de0f493b6e67525e58c..fd8509c2b6ec397609b49f332af709bb2e455474 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -289,6 +289,7 @@ struct engine {
 
   char snapshot_base_name[PARSER_MAX_LINE_SIZE];
   char snapshot_subdir[PARSER_MAX_LINE_SIZE];
+  int snapshot_distributed;
   int snapshot_compression;
   int snapshot_int_time_label_on;
   int snapshot_invoke_stf;