diff --git a/src/cache.h b/src/cache.h
index 1b675e1cc0da5daab53ef14fa06106eca186bd15..3eb1e194dd4232319ac1d4a4323ca8099f044063 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -296,11 +296,16 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     const struct entry *restrict sort_j, const double *restrict const shift,
     int *first_pi, int *last_pj) {
 
-  /* Pad number of particles read to the vector size. */
+  /* Make the number of particles to be read a multiple of the vector size.
+   * This eliminates serial remainder loops where possible when populating the
+   * cache. */
+
+  /* Is the number of particles to read a multiple of the vector size? */
   int rem = (ci->count - *first_pi) % VEC_SIZE;
   if (rem != 0) {
     int pad = VEC_SIZE - rem;
 
+    /* Decrease first_pi if there are particles in the cell left to read. */
     if (*first_pi - pad >= 0) *first_pi -= pad;
   }
 
@@ -308,6 +313,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   if (rem != 0) {
     int pad = VEC_SIZE - rem;
 
+    /* Increase last_pj if there are particles in the cell left to read. */
     if (*last_pj + pad < cj->count) *last_pj += pad;
   }
 
@@ -504,11 +510,16 @@ cache_read_two_partial_cells_sorted_force(
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
     const double *const shift, int *first_pi, int *last_pj) {
 
-  /* Pad number of particles read to the vector size. */
+  /* Make the number of particles to be read a multiple of the vector size.
+   * This eliminates serial remainder loops where possible when populating the
+   * cache. */
+
+  /* Is the number of particles to read a multiple of the vector size? */
   int rem = (ci->count - *first_pi) % VEC_SIZE;
   if (rem != 0) {
     int pad = VEC_SIZE - rem;
 
+    /* Decrease first_pi if there are particles in the cell left to read. */
     if (*first_pi - pad >= 0) *first_pi -= pad;
   }
 
@@ -516,6 +527,7 @@ cache_read_two_partial_cells_sorted_force(
   if (rem != 0) {
     int pad = VEC_SIZE - rem;
 
+    /* Increase last_pj if there are particles in the cell left to read. */
     if (*last_pj + pad < cj->count) *last_pj += pad;
   }
 
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 96bc4d049cbb773751409c6e91ba7196e88e1162..4403513177a6ce59a765bef4e277a8c8457ae356 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -297,7 +297,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
     const float first_di =
         sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift;
 
-    /* Loop through particles in cell j until they are not in range of pi. */
+    /* Loop through particles in cell j until they are not in range of pi.
+     * Make sure that temp stays between 0 and cj->count - 1. */
     while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
 
     max_index_i[first_pi] = temp;
@@ -309,6 +310,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
 
       const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift;
 
+      /* Make sure that temp stays between 0 and cj->count - 1. */
       while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
 
       max_index_i[i] = temp;
@@ -438,7 +440,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
                            max(pi->h, hj_max_raw) * kernel_gamma + dx_max -
                            rshift;
 
-    /* Loop through particles in cell j until they are not in range of pi. */
+    /* Loop through particles in cell j until they are not in range of pi.
+     * Make sure that temp stays between 0 and cj->count - 1. */
     while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
 
     max_index_i[first_pi] = temp;
@@ -451,6 +454,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
       const float di =
           sort_i[i].d + max(pi->h, hj_max_raw) * kernel_gamma + dx_max - rshift;
 
+      /* Make sure that temp stays between 0 and cj->count - 1. */
       while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
 
       max_index_i[i] = temp;
@@ -1092,7 +1096,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
           exit_iteration_align += pad;
       }
 
-      /* Loop over the parts in cj. */
+      /* Loop over the parts in cj, making sure to perform an iteration of the
+       * loop even if exit_iteration_align is zero and there is only one
+       * particle to interact with. */
       for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) {
 
         /* Get the cache index to the jth particle. */
@@ -1473,7 +1479,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
           exit_iteration_align += pad;
       }
 
-      /* Loop over the parts in cj. */
+      /* Loop over the parts in cj, making sure to perform an iteration of the
+       * loop even if exit_iteration_align is zero and there is only one
+       * particle to interact with. */
       for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) {
 
         /* Get the cache index to the jth particle. */