diff --git a/src/cache.h b/src/cache.h
index c1db1e882a7cdb52055d407f8aa1fc52ff82c187..98352a36168b3fe03ca0601b74129800386f90e2 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -136,10 +136,12 @@ struct c2_cache {
 __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
                                                       size_t count) {
 
-  /* Align cache on correct byte boundary and pad cache size to include 2 vector
-   * lengths for remainder operations. */
+  /* Align the cache on the correct byte boundary and pad the cache size to a
+   * multiple of the vector size, plus 2 extra vector lengths for remainder
+   * operations. */
   unsigned long alignment = sizeof(float) * VEC_SIZE;
-  unsigned int sizeBytes = (count + (2 * VEC_SIZE)) * sizeof(float);
+  unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
+  if (rem > 0) pad += VEC_SIZE - rem;
+  unsigned int sizeBytes = (count + pad) * sizeof(float);
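+  /* Worked example (assuming VEC_SIZE == 8): count == 27 gives rem == 3, so
+   * pad == 2 * 8 + (8 - 3) == 21 and sizeBytes covers 48 floats, i.e. the
+   * count rounded up to a vector multiple plus two extra vector lengths. */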
   int error = 0;
 
   /* Free memory if cache has already been allocated. */
@@ -243,7 +245,7 @@ __attribute__((always_inline)) INLINE void cache_read_cell_sorted(
   int idx;
   /* Shift the particles positions to a local frame (ci frame) so single precision can be
    * used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
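+/* #pragma simd is an Intel compiler extension (icc defines __ICC), so only
+ * emit it when building with icc. */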
 #pragma simd
 #endif
   for (int i = 0; i < ci->count; i++) {
@@ -277,8 +279,10 @@ __attribute__((always_inline)) INLINE void cache_read_cell_sorted(
  *
  * @param ci The i #cell.
  * @param cj The j #cell.
- * @param ci_cache The cache for cell ci.
- * @param cj_cache The cache for cell cj.
+ * @param ci_cache The #cache for cell ci.
+ * @param cj_cache The #cache for cell cj.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param sort_j The array of sorted particle indices for cell cj.
  * @param shift The amount to shift the particle positions to account for BCs
  */
 __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
@@ -287,7 +291,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
   int idx;
   /* Shift the particles positions to a local frame (ci frame) so single precision can be
    * used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
 #pragma simd
 #endif
   for (int i = 0; i < ci->count; i++) {
@@ -314,7 +318,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
 #endif
   }
  
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
 #pragma simd
 #endif
   for (int i = 0; i < cj->count; i++) {
@@ -341,60 +345,68 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
   }
 }
 
+/**
+ * @brief Populate the caches by reading only the particles that are within
+ * range of each other in the two adjoining cells. Also read the particles
+ * into the caches in sorted order.
+ *
+ * @param ci The i #cell.
+ * @param cj The j #cell.
+ * @param ci_cache The #cache for cell ci.
+ * @param cj_cache The #cache for cell cj.
+ * @param sort_i The array of sorted particle indices for cell ci.
+ * @param sort_j The array of sorted particle indices for cell cj.
+ * @param shift The amount to shift the particle positions to account for BCs.
+ * @param first_pi Pointer to the first particle in cell ci that is in range
+ *        (padded here to the vector boundary where possible).
+ * @param last_pj Pointer to the last particle in cell cj that is in range
+ *        (padded here to the vector boundary where possible).
+ * @param num_vec_proc Number of vectors that will be used to process the interaction.
+ */
 __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted_2(
-    const struct cell *const ci, const struct cell *const cj, struct cache *const ci_cache, struct cache *const cj_cache, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *const shift, int first_pi, int last_pj, const int num_vec_proc) {
+    const struct cell *const ci, const struct cell *const cj,
+    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
+    const double *const shift, int *first_pi, int *last_pj,
+    const int num_vec_proc) {
 
-  int idx;
-  /* Shift the particles positions to a local frame (ci frame) so single precision can be
-   * used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
-  /* Pad number of particles read to the vector size. */
-  int rem = (ci->count - first_pi) % (num_vec_proc * VEC_SIZE);
+  int idx, ci_cache_idx;
+  /* Pad number of particles read to the vector size. */
+  int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
     int pad = (num_vec_proc * VEC_SIZE) - rem;
     
-    if (first_pi - pad >= 0)
-      first_pi -= pad;
+    if (*first_pi - pad >= 0)
+      *first_pi -= pad;
   }
 
-  rem = last_pj % (num_vec_proc * VEC_SIZE);
+  rem = *last_pj % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
     int pad = (num_vec_proc * VEC_SIZE) - rem;
 
-    if (last_pj + pad < cj->count)
-      last_pj += pad;
+    if (*last_pj + pad < cj->count)
+      *last_pj += pad;
   }
 
-#ifdef WITH_VECTORIZATION
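+  /* Keep local copies of the padded bounds; the padded values remain visible
+   * to the caller through first_pi and last_pj. */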
+  int first_pi_align = *first_pi;
+  int last_pj_align = *last_pj;
+
+  /* Shift the particle positions to a local frame (the ci frame) so that
+   * single precision can be used instead of double precision. Also shift the
+   * cell ci particle positions due to BCs, but leave cell cj alone. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
 #pragma simd
 #endif
-  for (int i = first_pi; i < ci->count; i++) {
+  for (int i = first_pi_align; i < ci->count; i++) {
+    /* Make sure ci_cache is filled from the first element. */
+    ci_cache_idx = i - first_pi_align;
     idx = sort_i[i].i;
-    ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[i] = ci->parts[idx].h;
-
-    ci_cache->m[i] = ci->parts[idx].mass;
-    ci_cache->vx[i] = ci->parts[idx].v[0];
-    ci_cache->vy[i] = ci->parts[idx].v[1];
-    ci_cache->vz[i] = ci->parts[idx].v[2];
-
-#ifdef DOPAIR1_AUTO_VEC
-    ci_cache->rho[i]         = 0.0f; 
-    ci_cache->rho_dh[i]      = 0.0f; 
-    ci_cache->wcount[i]      = 0.0f; 
-    ci_cache->wcount_dh[i]   = 0.0f; 
-    ci_cache->div_v[i]       = 0.0f; 
-    ci_cache->curl_vx[i]     = 0.0f; 
-    ci_cache->curl_vy[i]     = 0.0f; 
-    ci_cache->curl_vz[i]     = 0.0f; 
-#endif
+    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
+    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
+    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
+    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
+
+    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
+    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
+    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
+    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
   }
  
-#ifdef WITH_VECTORIZATION
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
 #pragma simd
 #endif
-  for (int i = 0; i <= last_pj; i++) {
+  for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
     cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
     cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
@@ -405,16 +417,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted_2(
     cj_cache->vx[i] = cj->parts[idx].v[0];
     cj_cache->vy[i] = cj->parts[idx].v[1];
     cj_cache->vz[i] = cj->parts[idx].v[2];
-#ifdef DOPAIR1_AUTO_VEC
-    cj_cache->rho[i]         = 0.0f; 
-    cj_cache->rho_dh[i]      = 0.0f; 
-    cj_cache->wcount[i]      = 0.0f; 
-    cj_cache->wcount_dh[i]   = 0.0f; 
-    cj_cache->div_v[i]       = 0.0f; 
-    cj_cache->curl_vx[i]     = 0.0f; 
-    cj_cache->curl_vy[i]     = 0.0f; 
-    cj_cache->curl_vz[i]     = 0.0f; 
-#endif
   }
 }
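
A minimal standalone sketch of the padding that cache_read_two_cells_sorted_2 now applies through the first_pi/last_pj pointers. The helper pad_ranges, the values VEC_SIZE == 8, num_vec_proc == 2 and the particle counts are assumptions made purely for this illustration:

```c
#include <stdio.h>

#define VEC_SIZE 8 /* assumed vector length for the demo */

/* Mirrors the padding logic above: widen [first_pi, count_i) and [0, last_pj]
 * so that the number of ci particles read is a multiple of the chunk size,
 * without running past either cell. */
static void pad_ranges(int count_i, int count_j, int *first_pi, int *last_pj,
                       int num_vec_proc) {

  int rem = (count_i - *first_pi) % (num_vec_proc * VEC_SIZE);
  if (rem != 0) {
    int pad = (num_vec_proc * VEC_SIZE) - rem;
    if (*first_pi - pad >= 0) *first_pi -= pad;
  }

  rem = *last_pj % (num_vec_proc * VEC_SIZE);
  if (rem != 0) {
    int pad = (num_vec_proc * VEC_SIZE) - rem;
    if (*last_pj + pad < count_j) *last_pj += pad;
  }
}

int main(void) {

  int first_pi = 23, last_pj = 30;

  /* With 100 particles in each cell and chunks of 16, first_pi is pulled back
   * to 20 (so 80 particles are read from ci) and last_pj is pushed up to 32. */
  pad_ranges(100, 100, &first_pi, &last_pj, 2);
  printf("first_pi = %d, last_pj = %d\n", first_pi, last_pj);

  return 0;
}
```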