Commit 70d121cd authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

more work on manuscript.


Former-commit-id: be8549c54c1a26dcfa7c31b0c3478cea1183ec24
parent 84021614
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="744.09448819"
height="1052.3622047"
id="svg2"
version="1.1"
inkscape:version="0.48.3.1 r9886"
sodipodi:docname="CellLocking.svg">
<defs
id="defs4">
<marker
inkscape:stockid="Arrow2Mend"
orient="auto"
refY="0.0"
refX="0.0"
id="Arrow2Mend"
style="overflow:visible;">
<path
id="path3806"
style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
transform="scale(0.6) rotate(180) translate(0,0)" />
</marker>
</defs>
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="2"
inkscape:cx="319.93452"
inkscape:cy="689.70563"
inkscape:document-units="px"
inkscape:current-layer="layer1"
showgrid="true"
inkscape:window-width="1110"
inkscape:window-height="996"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="0"
inkscape:snap-global="true">
<inkscape:grid
type="xygrid"
id="grid2985" />
</sodipodi:namedview>
<metadata
id="metadata7">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1">
<rect
style="fill:#ffcf00;fill-opacity:1;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect2987"
width="20"
height="20"
x="310"
y="257.36218" />
<rect
y="302.36218"
x="325"
height="20"
width="20"
id="rect3757"
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
<rect
style="fill:#ffcf00;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1"
id="rect3759"
width="20"
height="20"
x="355"
y="302.36218" />
<rect
style="fill:#ff0000;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1"
id="rect3761"
width="20"
height="20"
x="265"
y="302.36218" />
<rect
y="302.36218"
x="295"
height="20"
width="20"
id="rect3763"
style="fill:#ffcf00;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1" />
<rect
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect3765"
width="20"
height="20"
x="400"
y="347.36218" />
<rect
y="347.36218"
x="430.5"
height="20"
width="20"
id="rect3767"
style="fill:#ff0000;fill-opacity:1;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
<rect
y="347.36218"
x="340"
height="20"
width="20"
id="rect3769"
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
<rect
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect3771"
width="20"
height="20"
x="370"
y="347.36218" />
<path
sodipodi:nodetypes="cccc"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
d="m 440,347.36218 0,-10 -75,0 0,-15"
id="path4233"
inkscape:connector-curvature="0" />
<path
sodipodi:nodetypes="ccc"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 350,347.36218 0,-10 15,0"
id="path4235"
inkscape:connector-curvature="0" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 365,337.36218 15,0 0,10"
id="path4237"
inkscape:connector-curvature="0" />
<path
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 380,337.36218 30,0 0,10"
id="path4239"
inkscape:connector-curvature="0" />
<path
inkscape:connector-curvature="0"
id="path4249"
d="m 275,302.36218 0,-10 45,0 0,-15"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
sodipodi:nodetypes="cccc" />
<path
inkscape:connector-curvature="0"
id="path4251"
d="m 305,302.36218 0,-10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<path
inkscape:connector-curvature="0"
id="path4253"
d="m 320,292.36218 15,0 0,10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<path
inkscape:connector-curvature="0"
id="path4255"
d="m 335,292.36218 30,0 0,10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<rect
y="347.36218"
x="265"
height="20"
width="20"
id="rect4275"
style="fill:#ff0000;fill-opacity:1;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
<rect
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect4277"
width="20"
height="20"
x="295"
y="347.36218" />
<rect
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
id="rect4279"
width="20"
height="20"
x="205"
y="347.36218" />
<rect
y="347.36218"
x="235"
height="20"
width="20"
id="rect4281"
style="fill:none;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
<path
sodipodi:nodetypes="cccc"
inkscape:connector-curvature="0"
id="path4285"
d="m 215,347.36218 0,-10 90,0 0,-15"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Mend)" />
<path
inkscape:connector-curvature="0"
id="path4287"
d="m 245,347.36218 0,-10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<path
sodipodi:nodetypes="ccc"
inkscape:connector-curvature="0"
id="path4289"
d="m 275,337.36218 0,0 0,10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<path
inkscape:connector-curvature="0"
id="path4291"
d="m 275,337.36218 30,0 0,10"
style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
<text
xml:space="preserve"
style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
x="304.91455"
y="316.0072"
id="text4872"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4874"
x="304.91455"
y="316.0072"
style="font-size:10px;text-align:center;text-anchor:middle">1</tspan></text>
<text
sodipodi:linespacing="125%"
id="text4876"
y="316.0072"
x="364.91455"
style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
xml:space="preserve"><tspan
style="font-size:10px;text-align:center;text-anchor:middle"
y="316.0072"
x="364.91455"
id="tspan4878"
sodipodi:role="line">1</tspan></text>
<text
xml:space="preserve"
style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans"
x="319.91455"
y="271.0072"
id="text4880"
sodipodi:linespacing="125%"><tspan
sodipodi:role="line"
id="tspan4882"
x="319.91455"
y="271.0072"
style="font-size:10px;text-align:center;text-anchor:middle">3</tspan></text>
</g>
</svg>
......@@ -66,7 +66,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Title, author and affiliations
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Fast Algorithms for Smoothed Particle Hydrodynamics on Multi-Core
\title{Efficient and Scalable Algorithms for Smoothed Particle Hydrodynamics on Multi-Core
Architectures}
\author{Pedro Gonnet\thanks{School of Engineering and Computing Sciences,
Durham University, Durham, United Kingdom ({\tt pedro.gonnet@durham.ac.uk}).}}
......@@ -95,7 +95,11 @@
% Abstract
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
Bla.
A new framework for the parallelization of Smoothed Particle Hydrodynamics (SPH)
simulations on shared-memory parallel architectures is described.
This framework relies on fast and cache-efficient cell-based neighbour-finding
algorithms, as well as task-based parallelism to achieve good scaling and
parallel efficiency on multi-core computers.
\end{abstract}
......@@ -115,7 +119,7 @@ multi-cores
\pagestyle{myheadings}
\thispagestyle{plain}
\markboth{P. GONNET}{FAST ALGORITHMS FOR SPH}
\markboth{P. GONNET}{EFFICIENT AND SCALABLE ALGORITHMS FOR SPH}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -627,21 +631,17 @@ The dependencies and conflicts between tasks are then given as follows:
\subsubsection{Task queues}
\begin{itemize}
\item If the dependencies and conflicts are defined correctly, then
there is no risk of concurrency problems and thus each task
can be implemented without special attention to the latter,
e.g.~it can update data without using exclusive access barriers
or atomic memory updates.
\item This, however, requires some care in how the individual tasks
are allocated to the computing threads, i.e.~each task should
be allocated once to a single thread, and should not have
any unresolved dependencies, or conflict with any concurrently
executing tasks.
\item In the following, tasks will be stored in one or more {\em queues}:
If the dependencies and conflicts are defined correctly, then
there is no risk of concurrency problems and thus each task
can be implemented without special attention to the latter,
e.g.~it can update data without using exclusive access barriers
or atomic memory updates.
This, however, requires some care in how the individual tasks
are allocated to the computing threads, i.e.~each task should
be allocated once to a single thread, and should not have
any unresolved dependencies, or conflict with any concurrently
executing tasks.
In the following, tasks will be stored in one or more {\em queues}:
\begin{center}\begin{minipage}{0.8\textwidth}
\begin{lstlisting}
......@@ -660,7 +660,7 @@ in the queue.
The {\tt pthread\_mutex\_t lock} is used to guarantee exclusive access
to the queue.
\item Task IDs are retrieved from the queue as follows:
Task IDs are retrieved from the queue as follows:
\begin{center}\begin{minipage}{0.8\textwidth}
\begin{lstlisting}
......@@ -688,72 +688,145 @@ int queue_gettask ( struct queue *q , int steal ) {
\end{lstlisting}
\end{minipage}\end{center}
\noindent i.e.~exclusive access to the queue is obtained by locking
its mutex in line~2. In lines~3 to~6, the tasks are inspected
in sequence until a task is found that has no unresolved
dependencies or existing conflicts.
If a task has been found, its ID is swapped with that at
position {\tt next}, and {\tt next} is incremented by one
(lines 8~to~11).
The lock on the queue is then released (line~12) and
the task ID, or {\tt -1} if no available task was found, is
returned.
\item The advantage of swapping the retrieved task to the next
position in the list is that if the queue is reset, e.g.~{\tt next}
is set to zero, and used again with the same set of tasks,
they will now be traversed in the order in which they were
executed in the previous run.
This provides a basic form of iterative refinement of the task
order.
\item The tasks can also be sorted topologically, according to their
dependency graph, to help minimize the effort required to find
a valid task.
\item The mutex at the start of {\tt queue\_gettask} is a potential
bottleneck if the time required to process a task is small
compared to the time required for all the threads to obtain
a task, e.g.~for large numbers of very small tasks and/or
a large number of threads.
\item One way of avoiding this problem is to use several concurrent
queues, e.g.~one queue per thread, and spread the tasks over
all queues.
\item A fixed assignment of tasks to queues can, however,
cause load balancing problems, e.g.~when a thread's queue is
empty before the others have finished.
\item In order to avoid such problems, {\em work-stealing} can be used:
If a thread cannot obtain a task from its own queue, it picks
another queue at random and tries to {\em steal} a task from it,
i.e.~if it can obtain a task, it removes it from the queue and
adds it to its own queue, thus iteratively rebalancing
the task queues if they are used repeatedly:
\noindent i.e.~exclusive access to the queue is obtained by locking
its mutex in line~2. In lines~3 to~6, the tasks are inspected
in sequence until a task is found that has no unresolved
dependencies or existing conflicts.
If a task has been found, its ID is swapped with that at
position {\tt next}, and {\tt next} is incremented by one
(lines 8~to~11).
The lock on the queue is then released (line~12) and
the task ID, or {\tt -1} if no available task was found, is
returned.
The advantage of swapping the retrieved task to the next
position in the list is that if the queue is reset, e.g.~{\tt next}
is set to zero, and used again with the same set of tasks,
they will now be traversed in the order in which they were
executed in the previous run.
This provides a basic form of iterative refinement of the task
order.
The tasks can also be sorted topologically, according to their
dependency graph, to help minimize the effort required to find
a valid task.
The mutex at the start of {\tt queue\_gettask} is a potential
bottleneck if the time required to process a task is small
compared to the time required for all the threads to obtain
a task, e.g.~for large numbers of very small tasks and/or
a large number of threads.
One way of avoiding this problem is to use several concurrent
queues, e.g.~one queue per thread, and spread the tasks over
all queues.
A fixed assignment of tasks to queues can, however,
cause load balancing problems, e.g.~when a thread's queue is
empty before the others have finished.
In order to avoid such problems, {\em work-stealing} can be used:
If a thread cannot obtain a task from its own queue, it picks
another queue at random and tries to {\em steal} a task from it,
i.e.~if it can obtain a task, it removes it from the queue and
adds it to its own queue, thus iteratively rebalancing
the task queues if they are used repeatedly:
\begin{center}\begin{minipage}{0.8\textwidth}
\begin{lstlisting}
while ( there is still a task in any of the queues ) {
while ( there is still a task in any of the queues ) {
if ( ( tid = queue_gettask( myq , 0 ) ) < 0 ) {
randq = pick a non-empty queue at random.
if ( ( tid = queue_gettask( randq , 1 ) ) >= 0 )
queue_addtask( myq , tid );
}
if ( tid >= 0 )
execute task tid.
}
\end{lstlisting}
\end{minipage}\end{center}
\end{itemize}
\noindent where {\tt myq} is the queue associated with the
current thread and {\tt queue\_addtask} adds a task ID
to the given queue.
\subsubsection{Cell locking}
\begin{itemize}
\item Explain problem of hierarchical locking, i.e.~interactions
involving higher-level cells exclude lower-level cells.
Particles within a cell are also within that cell's hierarchical
parents.
Therefore, when working on the particles of a cell, tasks which
operate on its parent's data should not be allowed to execute.
One way to avoid this problem is to require that a task
not only lock a cell, but also all of its hierarchical
parents in order to operate on its data.
This, however, would prevent tasks involving siblings,
whose particle sets do not overlap, from executing.
We avoid this problem by giving each cell both a {\em lock},
and a {\em hold} counter:
\item Two-phase locking up and down the cell hierarchy, with
``lock'' and ``hold'' states.
\begin{center}\begin{minipage}{0.8\textwidth}
\begin{lstlisting}
int cell_locktree ( struct cell *c ) {
struct cell *c1, *c2;
if ( trylock( c->lock ) != 0 )
return 1;
if ( c->hold > 0 ) {
unlock( c->lock );
return 1;
}
for ( c1 = c->parent ; c1 != NULL ; c1 = c1->parent ) {
if ( trylock( c1->lock ) != 0 )
break;
atomic_add( c1->hold , 1 );
unlock( c1->lock );
}
if ( c1 != NULL ) {
for ( c2 = c->parent ; c2 != c1 ; c2 = c2->parent )
atomic_sub( c2->hold , 1 );
unlock( c->lock );
return 1;
}
else
return 0;
}
\end{lstlisting}
\end{minipage}\end{center}
\end{itemize}
\noindent When trying to lock a cell, we first check that it is neither
locked (line 3) nor held (line 5), i.e.~its hold counter is zero, and lock it.
We then travel up the hierarchy increasing the
hold counter of each cell on the way, up to the topmost cell (lines 9--14).
If any cell along the hierarchy is locked (line 10), the locking is aborted
and all locks and holds are undone (lines 15--20, see \fig{CellLocking}).
The operations {\tt atomic\_add} and {\tt atomic\_sub} are understood,
respectively, to increase or decrease a value atomically.
When the cell is released, its lock is unlocked and the hold
counter of all hierarchical parents is decreased by one:
\begin{center}\begin{minipage}{0.8\textwidth}
\begin{lstlisting}
void cell_unlocktree ( struct cell *c ) {
struct cell *c1;
unlock( c->lock );
for ( c1 = c->parent ; c1 != NULL ; c1 = c1->parent ) {
atomic_sub( c1->hold , 1 );
}
}
\end{lstlisting}
\end{minipage}\end{center}
\begin{figure}[ht]
\centerline{\epsfig{file=figures/CellLocking.pdf,width=0.5\textwidth}}
\caption{Example of hierarchical cell locking. The cells marked in red
are ``locked'' while the cells marked in yellow have a ``hold'' count
larger than zero.
The hold count is shown inside each cell and corresponds to the number
of locked cells hierarchically below it.
All cells except for those locked or with a ``hold'' count larger than
zero can still be locked without causing concurrent data access.
}
\label{fig:CellLocking}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -765,7 +838,14 @@ int queue_gettask ( struct queue *q , int steal ) {
\begin{itemize}
\item Details of the task queues.
\item Implemented in C, compiled with {\tt gcc}.
\item Threading implemented with {\tt pthread}.
\item One task queue per thread.
\item As of yet, no use of SIMD capabilities to evaluate several
interactions at a time.
\item Details of the pair interactions.
......@@ -778,7 +858,11 @@ int queue_gettask ( struct queue *q , int steal ) {
\begin{itemize}
\item Show both large and small simulation setups.
\item Details of the simulation used, e.g.~size, number of particles,
etc...
\item So far only considering density and force computation,
particles not moving.
\end{itemize}
......@@ -801,7 +885,11 @@ int queue_gettask ( struct queue *q , int steal ) {
\begin{itemize}
\item Bla.
\item Good scaling.
\item Computational model can easily be exported to other architectures,
including GPUs (reference task-based parallelism on GPUs with Aidan),
and other multi-core accelerators such as the Intel MIC.
\end{itemize}
......@@ -811,7 +899,14 @@ int queue_gettask ( struct queue *q , int steal ) {
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgments}
ICC Durham, Nick Holliman's hardware, NVidia's hardware.
\begin{itemize}
\item Collaboration with Matthieu Schaller and Tom Theuns from the
Institute of Computational Cosmology (ICC) at Durham University.
\item Lydia Heck from the ICC for providing access to the infrastructure.
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment