diff --git a/src/io/subhalo_io.cpp b/src/io/subhalo_io.cpp
index b239d3c3..f630e37a 100644
--- a/src/io/subhalo_io.cpp
+++ b/src/io/subhalo_io.cpp
@@ -8,6 +8,8 @@
#include "../snapshot_number.h"
#include "../subhalo.h"
#include "../config_parser.h"
+#include "../task_limited_section.h"
+
#include "git_version_info.h"
void SubhaloSnapshot_t::BuildHDFDataType()
@@ -276,20 +278,14 @@ void SubhaloSnapshot_t::Save(MpiWorker_t &world)
string subdir = GetSubDir();
mkdir(subdir.c_str(), 0755);
- /* Decide how many ranks per node write simultaneously */
- int nr_nodes = (world.size() / world.MaxNodeSize);
- int nr_writing = HBTConfig.MaxConcurrentIO / nr_nodes;
- if (nr_writing < 1)
- nr_writing = 1; // Always at least one per node
-
/* Subhalo properties and bound particle lists. */
- WriteBoundFiles(world, nr_writing);
+ WriteBoundFiles(world);
/* Particles associated to each subhalo. Used for debugging and restarting. */
- WriteSourceFiles(world, nr_writing);
+ WriteSourceFiles(world);
}
-void SubhaloSnapshot_t::WriteBoundFiles(MpiWorker_t &world, const int &number_ranks_writing)
+void SubhaloSnapshot_t::WriteBoundFiles(MpiWorker_t &world)
{
/* Number of total subhalo entries */
HBTInt NumSubsAll = 0, NumSubs = Subhalos.size();
@@ -299,39 +295,19 @@ void SubhaloSnapshot_t::WriteBoundFiles(MpiWorker_t &world, const int &number_ra
cout << "saving " << NumSubsAll << " subhalos to " << GetSubDir() << endl;
/* Allow a limited number of ranks per node to write simultaneously */
- int writes_done = 0;
- for (int rank_within_node = 0; rank_within_node < world.MaxNodeSize; rank_within_node += 1)
- {
- if (rank_within_node == world.NodeRank)
- {
- WriteBoundSubfile(world.rank(), world.size(), NumSubsAll);
- writes_done += 1;
- }
- if (rank_within_node % number_ranks_writing == number_ranks_writing - 1)
- MPI_Barrier(world.Communicator);
- }
-
- /* Every rank should have executed the writing code exactly once */
- assert(writes_done == 1);
+ TaskLimitedSection section(MPI_COMM_WORLD, HBTConfig.MaxConcurrentIO);
+ section.start();
+ WriteBoundSubfile(world.rank(), world.size(), NumSubsAll);
+ section.end();
}
-void SubhaloSnapshot_t::WriteSourceFiles(MpiWorker_t &world, const int &number_ranks_writing)
+void SubhaloSnapshot_t::WriteSourceFiles(MpiWorker_t &world)
{
/* Allow a limited number of ranks per node to write simultaneously */
- int writes_done = 0;
- for (int rank_within_node = 0; rank_within_node < world.MaxNodeSize; rank_within_node += 1)
- {
- if (rank_within_node == world.NodeRank)
- {
- WriteSourceSubfile(world.rank(), world.size());
- writes_done += 1;
- }
- if (rank_within_node % number_ranks_writing == number_ranks_writing - 1)
- MPI_Barrier(world.Communicator);
- }
-
- /* Every rank should have executed the writing code exactly once */
- assert(writes_done == 1);
+ TaskLimitedSection section(MPI_COMM_WORLD, HBTConfig.MaxConcurrentIO);
+ section.start();
+ WriteSourceSubfile(world.rank(), world.size());
+ section.end();
}
void SubhaloSnapshot_t::WriteBoundSubfile(int iFile, int nfiles, HBTInt NumSubsAll)
diff --git a/src/io/swiftsim_io.cpp b/src/io/swiftsim_io.cpp
index 13dd5984..1130e07a 100644
--- a/src/io/swiftsim_io.cpp
+++ b/src/io/swiftsim_io.cpp
@@ -16,6 +16,7 @@ using namespace std;
#include "../hdf_wrapper.h"
#include "../mymath.h"
#include "../snapshot.h"
+#include "../task_limited_section.h"
#include "swiftsim_io.h"
#include "exchange_and_merge.h"
@@ -639,14 +640,6 @@ void SwiftSimReader_t::LoadSnapshot(MpiWorker_t &world, int snapshotId, vector
- // Determine global offset of first particle to read from this file:
- // This is the larger of the offset of the first particle in the file
- // and the offset of the first particle this rank is to read.
- HBTInt i1 = offset_file[file_nr];
- if (local_first_offset > i1)
- i1 = local_first_offset;
-
- // Determine global offset of last particle to read from this file:
- // This is the smaller of the offset to the last particle in this file
- // and the offset of the last particle this rank is to read.
- HBTInt i2 = offset_file[file_nr] + np_file[file_nr] - 1;
- if (local_last_offset < i2)
- i2 = local_last_offset;
-
- if (i2 >= i1)
+ // Determine global offset of first particle to read from this file:
+ // This is the larger of the offset of the first particle in the file
+ // and the offset of the first particle this rank is to read.
+ HBTInt i1 = offset_file[file_nr];
+ if (local_first_offset > i1)
+ i1 = local_first_offset;
+
+ // Determine global offset of last particle to read from this file:
+ // This is the smaller of the offset to the last particle in this file
+ // and the offset of the last particle this rank is to read.
+ HBTInt i2 = offset_file[file_nr] + np_file[file_nr] - 1;
+ if (local_last_offset < i2)
+ i2 = local_last_offset;
+
+ if (i2 >= i1)
{
// We have particles to read from this file.
HBTInt file_start = i1 - offset_file[file_nr]; // Offset to first particle to read
@@ -748,16 +737,9 @@ void SwiftSimReader_t::LoadSnapshot(MpiWorker_t &world, int snapshotId, vector
- // Determine global offset of first particle to read from this file:
- // This is the larger of the offset of the first particle in the file
- // and the offset of the first particle this rank is to read.
- HBTInt i1 = offset_file[file_nr];
- if (local_first_offset > i1)
- i1 = local_first_offset;
-
- // Determine global offset of last particle to read from this file:
- // This is the smaller of the offset to the last particle in this file
- // and the offset of the last particle this rank is to read.
- HBTInt i2 = offset_file[file_nr] + np_file[file_nr] - 1;
- if (local_last_offset < i2)
- i2 = local_last_offset;
-
- if (i2 >= i1)
+ // Determine global offset of first particle to read from this file:
+ // This is the larger of the offset of the first particle in the file
+ // and the offset of the first particle this rank is to read.
+ HBTInt i1 = offset_file[file_nr];
+ if (local_first_offset > i1)
+ i1 = local_first_offset;
+
+ // Determine global offset of last particle to read from this file:
+ // This is the smaller of the offset to the last particle in this file
+ // and the offset of the last particle this rank is to read.
+ HBTInt i2 = offset_file[file_nr] + np_file[file_nr] - 1;
+ if (local_last_offset < i2)
+ i2 = local_last_offset;
+
+ if (i2 >= i1)
{
// We have particles to read from this file.
HBTInt file_start = i1 - offset_file[file_nr]; // Offset to first particle to read
@@ -896,17 +869,10 @@ void SwiftSimReader_t::LoadGroups(MpiWorker_t &world, int snapshotId, vector
diff --git a/src/task_limited_section.h b/src/task_limited_section.h
new file mode 100644
--- /dev/null
+++ b/src/task_limited_section.h
+#include <mpi.h>
+#include <cassert>
+#include <cstdlib>
+
+/*
+ Class to limit the number of MPI ranks executing a piece of code
+ simultaneously. Used to implement the MaxConcurrentIO option.
+
+ Example usage:
+
+ TaskLimitedSection section(MPI_COMM_WORLD, HBTConfig.MaxConcurrentIO);
+ section.start();
+ ...
+ (I/O code goes here!)
+ ...
+ section.end();
+
+*/
+class TaskLimitedSection {
+
+private:
+
+ int max_nr_tasks;
+ MPI_Comm comm;
+ MPI_Win win;
+ int *buffer;
+ int controller_rank;
+ MPI_Request controller_rank_request;
+ int order;
+
+ const int CONTROLLER_RANK_TAG = 0;
+ const int GO_TAG = 1;
+ const int COMPLETION_TAG = 2;
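+
+ /*
+ The tags above implement a simple hand-off protocol:
+ - CONTROLLER_RANK_TAG: the first rank to finish the section broadcasts its
+ rank, so every rank knows where to send its completion message.
+ - GO_TAG: the controller signals one waiting rank that it may enter the
+ section.
+ - COMPLETION_TAG: a rank that has finished notifies the controller, which
+ can then release the next waiting rank.
+ */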
+
+public:
+
+ TaskLimitedSection(MPI_Comm comm, const int max_nr_tasks) {
+
+ int comm_size;
+ MPI_Comm_size(comm, &comm_size);
+ int comm_rank;
+ MPI_Comm_rank(comm, &comm_rank);
+
+ // Renumber ranks so we're not just allowing the first N to run initially -
+ // ideally we want to have the active ranks spread over all compute nodes.
+ int block_size = max_nr_tasks;
+ int position_in_block = comm_rank % block_size;
+ int block_index = comm_rank / block_size;
+ int nr_blocks = comm_size / max_nr_tasks;
+ if(comm_size % max_nr_tasks != 0)nr_blocks += 1;
+ assert(block_size*block_index+position_in_block == comm_rank);
+ order = position_in_block * nr_blocks + block_index;
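+
+ /* Example: with comm_size=8 and max_nr_tasks=2, nr_blocks=4 and ranks 0-7
+ get order 0,4,1,5,2,6,3,7. The two ranks allowed to proceed first
+ (order 0 and 1) are then original ranks 0 and 2, so the initial batch of
+ active ranks is spread out rather than packed together. */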
+
+ // Create the reordered communicator
+ MPI_Comm_split(comm, 0, order, &(this->comm));
+ this->max_nr_tasks = max_nr_tasks;
+ }
+
+ ~TaskLimitedSection() {
+ MPI_Comm_free(&comm);
+ }
+
+ void start() {
+
+ /* Get rank and number of ranks */
+ int comm_size;
+ MPI_Comm_size(comm, &comm_size);
+ int comm_rank;
+ MPI_Comm_rank(comm, &comm_rank);
+
+ /* If all ranks are allowed to run there's nothing to do */
+ if(max_nr_tasks >= comm_size)return;
+
+ /* Allocate and init counter for RMA */
+ MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &buffer);
+ *buffer = 0;
+ MPI_Win_create(buffer, sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);
+
+ /* Post a receive to get controller task's rank (will be first rank to finish) */
+ MPI_Irecv(&controller_rank, 1, MPI_INT, MPI_ANY_SOURCE,
+ CONTROLLER_RANK_TAG, comm, &controller_rank_request);
+
+ /* The first max_nr_tasks ranks can proceed immediately */
+ if(comm_rank < max_nr_tasks)return;
+
+ /* Others need to wait for a message to proceed */
+ int go;
+ MPI_Recv(&go, 1, MPI_INT, MPI_ANY_SOURCE, GO_TAG, comm, MPI_STATUS_IGNORE);
+
+ }
+
+ void end() {
+
+ /* Get rank and number of ranks */
+ int comm_size;
+ MPI_Comm_size(comm, &comm_size);
+ int comm_rank;
+ MPI_Comm_rank(comm, &comm_rank);
+
+ /* If all ranks are allowed to run there's nothing to do */
+ if(max_nr_tasks >= comm_size)return;
+
+ MPI_Request *request = (MPI_Request *) malloc(sizeof(MPI_Request)*comm_size);
+
+ /*
+ Check if we're the first task to reach the end of the section:
+ We do this by doing an atomic fetch and increment on the count of
+ the number of ranks that have finished. If the count is zero we're
+ the first and will become responsible for signalling other ranks
+ to proceed.
+
+ We only need to check the completion count for the first max_nr_tasks
+ ranks, because others can't start until another rank finishes so they
+ can't be first to finish.
+ */
+ int completion_count = 0;
+ if(comm_rank < max_nr_tasks) {
+ /* We're one of the ranks that started immediately, so we might be first
+ to complete */
+ MPI_Win_lock(MPI_LOCK_EXCLUSIVE, comm_size-1, 0, win);
+ int to_add = 1;
+ MPI_Get_accumulate(&to_add, 1, MPI_INT,
+ &completion_count, 1, MPI_INT,
+ comm_size-1, 0, 1, MPI_INT, MPI_SUM, win);
+ MPI_Win_unlock(comm_size-1, win);
+ } else {
+ /* We aren't in the initial batch of max_nr_tasks so we can't be first to complete.
+ Skip the get_accumulate so we're not waiting for the last rank to respond
+ when it might be busy in non-MPI code. */
+ completion_count = 1;
+ }
+ if(completion_count == 0) {
+
+ /* This task is the first to reach the end of the section, so tell everyone */
+ for(int dest=0; dest<comm_size; dest+=1)
+ MPI_Isend(&comm_rank, 1, MPI_INT, dest, CONTROLLER_RANK_TAG, comm, &request[dest]);
+
+ /* Wait for the other ranks to finish the section, releasing one waiting
+ rank each time a completion message arrives */
+ int nr_left = comm_size - max_nr_tasks; /* ranks still waiting for a go signal */
+ int nr_running = comm_size - 1; /* ranks we still expect to hear from */
+ while(nr_left + nr_running > 0) {
+
+ /* Receive a completion message, if any are still outstanding */
+ if(nr_running > 0) {
+ int done;
+ MPI_Status status;
+ MPI_Recv(&done, 1, MPI_INT, MPI_ANY_SOURCE, COMPLETION_TAG, comm, &status);
+ nr_running -= 1;
+ }
+
+ /* If there are tasks still waiting, send the next go signal */
+ if(nr_left > 0) {
+ int dest = comm_size - nr_left;
+ int go = 1;
+ MPI_Send(&go, 1, MPI_INT, dest, GO_TAG, comm);
+ nr_left -= 1;
+ }
+ }
+ }
+
+ /* Make sure we've received the controller rank */
+ MPI_Wait(&(controller_rank_request), MPI_STATUS_IGNORE);
+
+ /* Send completion message if we're not the controller */
+ if(completion_count > 0) {
+ int complete = 1;
+ MPI_Send(&complete, 1, MPI_INT, controller_rank, COMPLETION_TAG, comm);
+ }
+
+ /* Make sure all sends from the controller completed */
+ if(completion_count==0)
+ MPI_Waitall(comm_size, request, MPI_STATUSES_IGNORE);
+
+ /* Tidy up */
+ free(request);
+ MPI_Win_free(&win);
+ MPI_Free_mem(buffer);
+ }
+
+};
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index e4cb1daf..fa863053 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -52,6 +52,7 @@ foreach(TEST_NAME
test_locate_ids_random
test_mergertree
test_myalltoall
+ test_limited_section
)
add_executable(${TEST_NAME} ${TEST_NAME}.cpp $ $)
diff --git a/unit_tests/test_limited_section.cpp b/unit_tests/test_limited_section.cpp
new file mode 100644
index 00000000..19f1093a
--- /dev/null
+++ b/unit_tests/test_limited_section.cpp
@@ -0,0 +1,97 @@
+#include <mpi.h>
+#include <iostream>
+#include <time.h>
+#include <errno.h>
+
+#include "task_limited_section.h"
+#include "verify.h"
+
+/*
+ Test code to limit number of tasks executing simultaneously.
+*/
+int main(int argc, char *argv[])
+{
+
+ MPI_Init(&argc, &argv);
+ int comm_size;
+ MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
+ int comm_rank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+ /* Skip this test if we're running on only one MPI rank */
+ if(comm_size==1) {
+ MPI_Finalize();
+ return 0;
+ }
+
+ /* Set up count of number of tasks executing */
+ int *count;
+ MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &count);
+ *count = 0;
+ MPI_Win win;
+ MPI_Win_create(count, sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+
+ /* Split off one rank to maintain a count of tasks currently executing */
+ int color = (comm_rank==0) ? 0 : 1;
+ int key = comm_rank;
+ MPI_Comm split_comm;
+ MPI_Comm_split(MPI_COMM_WORLD, color, key, &split_comm);
+
+ if(color == 1) {
+ int split_comm_size = comm_size - 1;
+ for(int max_nr_tasks = 1; max_nr_tasks <= split_comm_size; max_nr_tasks += 1) {
+
+ TaskLimitedSection section(split_comm, max_nr_tasks);
+ section.start();
+
+ /* On starting, increment the counter */
+ int start_count = -1;
+ MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+ int to_add = 1;
+ MPI_Get_accumulate(&to_add, 1, MPI_INT,
+ &start_count, 1, MPI_INT,
+ 0, 0, 1, MPI_INT, MPI_SUM, win);
+ MPI_Win_unlock(0, win);
+ /* When we start, should have 0 to max_nr_tasks-1 other tasks running */
+ verify(start_count >= 0);
+ verify(start_count < max_nr_tasks);
+
+ /* Sleep for a bit */
+ struct timespec ts;
+ ts.tv_sec = 0;
+ ts.tv_nsec = 100 * 1000000; // 100 millisec
+ int res;
+ do {
+ res = nanosleep(&ts, &ts);
+ } while (res && errno == EINTR);
+
+ /* On finishing, decrement the counter */
+ int end_count = -1;
+ MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, win);
+ to_add = -1;
+ MPI_Get_accumulate(&to_add, 1, MPI_INT,
+ &end_count, 1, MPI_INT,
+ 0, 0, 1, MPI_INT, MPI_SUM, win);
+ MPI_Win_unlock(0, win);
+ /* When we finish, should have 1 to max_nr_tasks tasks running (including our self) */
+ verify(end_count > 0);
+ verify(end_count <= max_nr_tasks);
+
+ section.end();
+
+ // Report maximum counts:
+ // We should usually have start_count_max=max_nr_tasks-1 and end_count_max=max_nr_tasks,
+ // although this is not guaranteed (e.g. if system is busy and some tasks are delayed).
+ int start_count_max;
+ MPI_Allreduce(&start_count, &start_count_max, 1, MPI_INT, MPI_MAX, split_comm);
+ int end_count_max;
+ MPI_Allreduce(&end_count, &end_count_max, 1, MPI_INT, MPI_MAX, split_comm);
+ if(comm_rank==1)
+ std::cout << "Max ranks = " << max_nr_tasks << ", max start count = " <<
+ start_count_max << ", max end count = " << end_count_max << std::endl;
+ }
+ }
+
+ MPI_Win_free(&win);
+ MPI_Free_mem(count);
+ MPI_Finalize();
+
+ return 0;
+}