Commit 64ca63cb authored by James Willis's avatar James Willis
Browse files

Formatting.

parent db6f5c73
......@@ -26,8 +26,8 @@
#include "cell.h"
#include "error.h"
#include "part.h"
#include "vector.h"
#include "sort.h"
#include "vector.h"
#define NUM_VEC_PROC 2
#define CACHE_ALIGN sizeof(float) * VEC_SIZE
......@@ -61,13 +61,12 @@ struct cache {
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
/* Cache size. */
int count;
};
/* Secondary cache struct to hold a list of interactions between two
......@@ -108,7 +107,8 @@ struct c2_cache {
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
size_t count) {
/* Align cache on correct byte boundary and pad cache size to be a multiple of the vector size
/* Align cache on correct byte boundary and pad cache size to be a multiple of
* the vector size
* and include 2 vector lengths for remainder operations. */
unsigned long alignment = sizeof(float) * VEC_SIZE;
unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
......@@ -138,7 +138,7 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
error += posix_memalign((void **)&c->vz, alignment, sizeBytes);
error += posix_memalign((void **)&c->h, alignment, sizeBytes);
error += posix_memalign((void **)&c->max_d, alignment, sizeBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
c->count = count;
......@@ -173,7 +173,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
}
/**
* @brief Populate cache by reading in the particles from two cells in unsorted order.
* @brief Populate cache by reading in the particles from two cells in unsorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
......@@ -182,10 +183,14 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
* @param shift The amount to shift the particle positions to account for BCs
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells(
const struct cell *const ci, const struct cell *const cj, struct cache *const ci_cache, struct cache *const cj_cache, const double *const shift) {
/* Shift the particles positions to a local frame (ci frame) so single precision can be
* used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const double *const shift) {
/* Shift the particles' positions to a local frame (ci frame) so single
 * precision can be used instead of double precision. Also shift the cell ci
 * particles' positions due to BCs, but leave cell cj. */
for (int i = 0; i < ci->count; i++) {
ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
......@@ -197,7 +202,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells(
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
}
for (int i = 0; i < cj->count; i++) {
cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
......@@ -212,17 +217,21 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells(
}
__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
const struct cell *const ci, struct cache *const ci_cache, const struct entry *restrict sort_i, double *const loc, double *const shift) {
const struct cell *const ci, struct cache *const ci_cache,
const struct entry *restrict sort_i, double *const loc,
double *const shift) {
int idx;
/* Shift the particles positions to a local frame (ci frame) so single precision can be
* used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
/* Shift the particles' positions to a local frame (ci frame) so single
 * precision can be used instead of double precision. Also shift the cell ci
 * particles' positions due to BCs, but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < ci->count; i++) {
idx = sort_i[i].i;
ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
......@@ -236,7 +245,8 @@ __attribute__((always_inline)) INLINE void cache_read_cell_sorted(
}
/**
* @brief Populate cache by reading in the particles from two cells in sorted order.
* @brief Populate cache by reading in the particles from two cells in sorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
......@@ -247,11 +257,16 @@ __attribute__((always_inline)) INLINE void cache_read_cell_sorted(
* @param shift The amount to shift the particle positions to account for BCs
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
const struct cell *const ci, const struct cell *const cj, struct cache *const ci_cache, struct cache *const cj_cache, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *const shift) {
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift) {
int idx;
/* Shift the particles positions to a local frame (ci frame) so single precision can be
* used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
/* Shift the particles' positions to a local frame (ci frame) so single
 * precision can be used instead of double precision. Also shift the cell ci
 * particles' positions due to BCs, but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
......@@ -267,7 +282,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
ci_cache->vy[i] = ci->parts[idx].v[1];
ci_cache->vz[i] = ci->parts[idx].v[2];
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
......@@ -286,7 +301,9 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
}
/**
* @brief Populate caches by only reading particles that are within range of each other within the adjoining cell.Also read the particles into the cache in sorted order.
* @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell. Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
......@@ -297,17 +314,22 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the interaction.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction.
*/
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const struct cell *const ci, const struct cell *const cj, struct cache *const ci_cache, struct cache *const cj_cache, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *const shift, int *first_pi, int *last_pj, const int num_vec_proc) {
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj,
const int num_vec_proc) {
int idx, ci_cache_idx;
/* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
if (*first_pi - pad >= 0) *first_pi -= pad;
}
......@@ -321,8 +343,10 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
int first_pi_align = *first_pi;
int last_pj_align = *last_pj;
/* Shift the particles positions to a local frame (ci frame) so single precision can be
* used instead of double precision. Also shift the cell ci, particles positions due to BCs but leave cell cj. */
/* Shift the particles' positions to a local frame (ci frame) so single
 * precision can be used instead of double precision. Also shift the cell ci
 * particles' positions due to BCs, but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
......@@ -341,9 +365,10 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
}
float fake_pix = 2.0f * ci_cache->x[ci->count - 1];
for(int i=ci->count - first_pi_align; i<ci->count - first_pi_align + VEC_SIZE; i++)
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++)
ci_cache->x[i] = fake_pix;
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
......@@ -361,7 +386,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
}
float fake_pjx = 2.0f * cj_cache->x[last_pj_align];
for(int i=last_pj_align + 1; i<last_pj_align + 1 + VEC_SIZE; i++)
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++)
cj_cache->x[i] = fake_pjx;
}
......
......@@ -32,8 +32,8 @@
* Gadget-2 tree-code neighbours search.
*/
#include "minmax.h"
#include "cache.h"
#include "minmax.h"
/**
* @brief Density loop
......@@ -275,8 +275,16 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
pi->density.rot_v[2] += fac * curlvr[2];
}
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density_jsw(
const float r2, const float hig2, const float dx, const float dy, const float dz, const float h_inv, const float hj, const float vi_x, const float vi_y, const float vi_z, const float vj_x, const float vj_y, const float vj_z, const float mj, float *const restrict rho, float *const restrict rho_dh, float *const restrict wcount, float *const restrict wcount_dh, float *const restrict div_v, float *const restrict curl_vx, float *const restrict curl_vy, float *const restrict curl_vz) {
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_density_jsw(
const float r2, const float hig2, const float dx, const float dy,
const float dz, const float h_inv, const float hj, const float vi_x,
const float vi_y, const float vi_z, const float vj_x, const float vj_y,
const float vj_z, const float mj, float *const restrict rho,
float *const restrict rho_dh, float *const restrict wcount,
float *const restrict wcount_dh, float *const restrict div_v,
float *const restrict curl_vx, float *const restrict curl_vy,
float *const restrict curl_vz) {
if (r2 < hig2) {
......@@ -291,7 +299,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density_jsw
kernel_deval(u, &wi, &wi_dx);
const float fac = mj * wi_dx * ri;
/* Compute dv dot r */
const float dv_x = vi_x - vj_x;
const float dv_y = vi_y - vj_y;
......@@ -431,7 +439,7 @@ runner_iact_nonsym_intrinsic_vec_density(
vector vjx, vjy, vjz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */
mj.v = vec_unaligned_load(Mj);
vjx.v = vec_unaligned_load(Vjx);
......@@ -488,18 +496,19 @@ runner_iact_nonsym_intrinsic_vec_density(
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask.v);
vec_mul(xi.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
......@@ -511,36 +520,44 @@ runner_iact_nonsym_intrinsic_vec_density(
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_intrinsic_vec_2_density(
const struct cache *const cj_cache, const int *const indices, vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix,
vector viy, vector viz,
vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
vector mask, int knlMask) {
const struct cache *const cj_cache, const int *const indices, vector *r2,
vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy,
vector viz, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
vector *curlvySum, vector *curlvzSum, vector mask, int knlMask) {
//vector r, ri, r2, xi, wi, wi_dx;
// vector r, ri, r2, xi, wi, wi_dx;
vector r, ri, xi, wi, wi_dx;
vector mj;
//vector dx, dy, dz, dvx, dvy, dvz;
// vector dx, dy, dz, dvx, dvy, dvz;
vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */
mj.v = vec_set(cj_cache->m[indices[0]], cj_cache->m[indices[1]], cj_cache->m[indices[2]], cj_cache->m[indices[3]],
cj_cache->m[indices[4]], cj_cache->m[indices[5]], cj_cache->m[indices[6]], cj_cache->m[indices[7]]);
vjx.v = vec_set(cj_cache->vx[indices[0]], cj_cache->vx[indices[1]], cj_cache->vx[indices[2]], cj_cache->vx[indices[3]],
cj_cache->vx[indices[4]], cj_cache->vx[indices[5]], cj_cache->vx[indices[6]], cj_cache->vx[indices[7]]);
vjy.v = vec_set(cj_cache->vy[indices[0]], cj_cache->vy[indices[1]], cj_cache->vy[indices[2]], cj_cache->vy[indices[3]],
cj_cache->vy[indices[4]], cj_cache->vy[indices[5]], cj_cache->vy[indices[6]], cj_cache->vy[indices[7]]);
vjz.v = vec_set(cj_cache->vz[indices[0]], cj_cache->vz[indices[1]], cj_cache->vz[indices[2]], cj_cache->vz[indices[3]],
cj_cache->vz[indices[4]], cj_cache->vz[indices[5]], cj_cache->vz[indices[6]], cj_cache->vz[indices[7]]);
//dx.v = vec_load(Dx);
//dy.v = vec_load(Dy);
//dz.v = vec_load(Dz);
mj.v = vec_set(cj_cache->m[indices[0]], cj_cache->m[indices[1]],
cj_cache->m[indices[2]], cj_cache->m[indices[3]],
cj_cache->m[indices[4]], cj_cache->m[indices[5]],
cj_cache->m[indices[6]], cj_cache->m[indices[7]]);
vjx.v = vec_set(cj_cache->vx[indices[0]], cj_cache->vx[indices[1]],
cj_cache->vx[indices[2]], cj_cache->vx[indices[3]],
cj_cache->vx[indices[4]], cj_cache->vx[indices[5]],
cj_cache->vx[indices[6]], cj_cache->vx[indices[7]]);
vjy.v = vec_set(cj_cache->vy[indices[0]], cj_cache->vy[indices[1]],
cj_cache->vy[indices[2]], cj_cache->vy[indices[3]],
cj_cache->vy[indices[4]], cj_cache->vy[indices[5]],
cj_cache->vy[indices[6]], cj_cache->vy[indices[7]]);
vjz.v = vec_set(cj_cache->vz[indices[0]], cj_cache->vz[indices[1]],
cj_cache->vz[indices[2]], cj_cache->vz[indices[3]],
cj_cache->vz[indices[4]], cj_cache->vz[indices[5]],
cj_cache->vz[indices[6]], cj_cache->vz[indices[7]]);
// dx.v = vec_load(Dx);
// dy.v = vec_load(Dy);
// dz.v = vec_load(Dz);
/* Get the radius and inverse radius. */
//r2.v = vec_load(R2);
// r2.v = vec_load(R2);
ri = vec_reciprocal_sqrt(*r2);
r.v = vec_mul(r2->v, ri.v);
......@@ -590,18 +607,19 @@ runner_iact_nonsym_intrinsic_vec_2_density(
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))), mask.v);
vec_mul(xi.v, wi_dx.v))),
mask.v);
wcountSum->v += vec_and(wi.v, mask.v);
wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
......@@ -629,7 +647,7 @@ runner_iact_nonsym_1_vec_density(
vector vjx, vjy, vjz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */
mj.v = vec_load(Mj);
vjx.v = vec_load(Vjx);
......@@ -690,15 +708,15 @@ runner_iact_nonsym_1_vec_density(
curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
curlvxSum->v);
curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
curlvySum->v);
curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
curlvzSum->v);
#else
#else
rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
vec_mul(xi.v, wi_dx.v))),
......
......@@ -435,7 +435,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
dw_dx->v = (dw_dx->v * x.v) + w->v;
w->v = (x.v * w->v) + c[k].v;
}
#endif
/* Return everything */
......@@ -443,7 +443,6 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
......
......@@ -51,7 +51,7 @@ struct runner {
/*! The particle cache of cell ci. */
struct cache ci_cache;
/*! The particle cache of cell cj. */
struct cache cj_cache;
};
......
......@@ -2080,12 +2080,12 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid,
if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1);
if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1);
/* Compute the interactions. */
/* Compute the interactions. */
#if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \
defined(GADGET2_SPH)
runner_dopair1_density_vec(r, ci, cj);
#else
DOPAIR1(r, ci, cj);
DOPAIR1(r, ci, cj);
#endif
}
......
This diff is collapsed.
......@@ -24,20 +24,22 @@
#include "../config.h"
/* Local headers */
#include "active.h"
#include "cell.h"
#include "engine.h"
#include "hydro.h"
#include "part.h"
#include "runner.h"
#include "runner.h"
#include "timers.h"
#include "vector.h"
#include "active.h"
#include "runner.h"
/* Function prototypes. */
void runner_doself1_density_vec(struct runner *r, struct cell *restrict c);
void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c);
void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj);
void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci, struct cell *restrict cj);
void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci,
struct cell *restrict cj);
void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci,
struct cell *restrict cj);
#endif /* SWIFT_RUNNER_VEC_H */
......@@ -110,7 +110,8 @@
/* Performs a horizontal add on the vector and adds the result to a float. */
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
/* Calculates the number of set bits in the mask and adds the result to an int. */
/* Calculates the number of set bits in the mask and adds the result to an int.
*/
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
pack += __builtin_popcount(mask);
......@@ -190,8 +191,8 @@
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
{ \
......@@ -216,8 +217,8 @@
/* Form a packed mask without intrinsics if AVX2 is not present. */
#ifndef VEC_FORM_PACKED_MASK
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
/* Takes an integer mask and forms a left-packed integer vector
* containing indices of the set bits in the integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
{ \
......@@ -225,8 +226,8 @@
if ((mask & (1 << i))) v_mask.i[pack++] = i; \
}
/* Takes two integer masks and forms two left-packed integer vectors
* containing indices of the set bits in each corresponding integer mask.
/* Takes two integer masks and forms two left-packed integer vectors
* containing indices of the set bits in each corresponding integer mask.
* Also returns the total number of bits set in the mask. */
#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
{ \
......@@ -238,7 +239,7 @@
#endif
/* Performs a left-pack on a vector based upon a mask and returns the result. */
/* This uses AVX intrinsics, but this is slower than performing the left-pack
/* This uses AVX intrinsics, but this is slower than performing the left-pack
* manually by looping over the vectors. */
#ifndef VEC_LEFT_PACK
#define VEC_LEFT_PACK(a, mask, result) \
......
......@@ -335,13 +335,20 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count,
/* Just a forward declaration... */
void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_nosort_density(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, struct cell *cj);
void runner_dopair1_nosort_density(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci,
struct cell *cj);
void runner_doself1_density(struct runner *r, struct cell *ci);
void runner_doself1_density_vec(struct runner *r, struct cell *ci);
......@@ -485,8 +492,8 @@ int main(int argc, char *argv[]) {
cache_init(&runner.ci_cache, 512);
runner.cj_cache.count = 0;
cache_init(&runner.cj_cache, 512);
//cj_cache.count = 0;
//cache_init(&cj_cache, 512);
// cj_cache.count = 0;
// cache_init(&cj_cache, 512);
#endif
/* Run all the pairs */
......@@ -501,7 +508,7 @@ int main(int argc, char *argv[]) {
}
}
/* And now the self-interaction */
/* And now the self-interaction */
const ticks self_tic = getticks();
DOSELF1(&runner, main_cell);
......@@ -575,7 +582,8 @@ int main(int argc, char *argv[]) {
dump_particle_fields(outputFileName, main_cell, cells);
/* Check serial results against the vectorised results. */
//if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold))
// if (check_results(main_cell->parts, vec_parts, main_cell->count,
// threshold))
// message("Differences found...");
/* Output timing */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment