\documentclass{beamer}
% \documentclass[handout]{beamer}


%% Some packages
\usepackage{epsfig}
\usepackage{algorithmic}
\usepackage{array}
\usepackage{eurosym}
\usepackage{listings}
\lstset{basicstyle=\small\tt}
\usepackage[orientation=landscape,size=custom,width=16.8,height=10.5,scale=0.5]{beamerposter} 
%\usepackage[T1]{fontenc}


%% Some macros
\newcommand\dx{\,\mbox{d}x}


%% Table column types
\newcommand\PBS[1]{\let\temp=\\%
    #1%
    \let\\=\temp}
\newcolumntype{x}[1]
    {>{\PBS\raggedright\hspace{0pt}}p{#1}}

%% ETH font
\renewcommand{\familydefault}{let}
\renewcommand{\sfdefault}{let}
%\setbeamerfont{normal text}{family=\sffamily}


%% Colors
\xdefinecolor{grey}{rgb}{0.95,0.95,0.9375}
\xdefinecolor{myblue}{rgb}{0.664,0.762,0.867}
\xdefinecolor{myred}{rgb}{1.0,0.0,0.328}

\setbeamercolor{background canvas}{fg=black,bg=grey}
\setbeamercolor{normal text}{fg=black,bg=grey}
\setbeamercolor{itemize item}{fg=myblue}
\setbeamercolor{itemize subitem}{fg=myblue}
\setbeamercolor{itemize subsubitem}{fg=myblue}
\setbeamercolor{alert}{fg=myred}

\setbeamercolor{section in head/foot}{fg=white,bg=black}
\setbeamerfont{section in head/foot}{size=\LARGE}

\setbeamercolor{subsection in head/foot}{fg=white,bg=black}
\setbeamerfont{subsection in head/foot}{size=\normalsize}

\setbeamercolor{title in head/foot}{fg=white,bg=black}
\setbeamerfont{title in head/foot}{size=\tiny}

\setbeamercolor{date in head/foot}{fg=white,bg=black}
\setbeamerfont{date in head/foot}{size=\tiny}

\setbeamerfont{itemize subitem}{size=\normalsize}

\setbeamerfont{footnote}{size=\tiny}


%% Margins
\setbeamersize{text margin left=3ex}
\setbeamersize{text margin right=5ex}
\setbeamersize{sidebar width left=0ex}
\setbeamersize{sidebar width right=0ex}


%% Define the bullet
\setbeamertemplate{itemize item}[square]


%% Define the Headline
% \setbeamertemplate{headline}{
% \begin{beamercolorbox}[wd=0.8\paperwidth,ht=8ex,dp=2ex,leftskip=3ex]{section in head/foot}
%     \usebeamerfont{section in head/foot}\insertsection
% \end{beamercolorbox}
% \begin{beamercolorbox}[wd=0.8\paperwidth,ht=4ex,dp=3.5ex,leftskip=3ex,rightskip=3ex]{subsection in head/foot}
%     \usebeamerfont{subsection in head/foot}\insertsubsection
% \end{beamercolorbox}
% \begin{beamercolorbox}[right,wd=0.2\paperwidth,ht=12ex,dp=2ex,rightskip=3ex]{section in head/foot}
%     \includegraphics[width=10ex]{logo.pdf}
% \end{beamercolorbox}
% }
\setbeamertemplate{headline}{
\hbox{\begin{beamercolorbox}[wd=0.75\paperwidth,ht=14ex]{}
\begin{beamercolorbox}[wd=0.75\paperwidth,ht=8ex,dp=2ex,leftskip=3ex]{section in head/foot}
    \usebeamerfont{section in head/foot}\insertsection
\end{beamercolorbox}
\begin{beamercolorbox}[wd=0.75\paperwidth,ht=4ex,dp=3.5ex,leftskip=3ex,rightskip=3ex]{subsection in head/foot}
    \usebeamerfont{subsection in head/foot}\insertsubsection
\end{beamercolorbox}\end{beamercolorbox}%
\begin{beamercolorbox}[right,wd=0.25\paperwidth,ht=14ex,dp=3.5ex,rightskip=3ex]{section in head/foot}
    \vspace{-1ex}\includegraphics[height=12ex]{DU_W_O_med.png}
\end{beamercolorbox}}
}


%% Define the Footline
\setbeamertemplate{footline}{
\hbox{\begin{beamercolorbox}[wd=0.7\paperwidth,ht=3ex,dp=1.4ex,leftskip=3ex]{title in head/foot}
    \usebeamerfont{title in head/foot} \insertauthor: \inserttitle
\end{beamercolorbox}%
\begin{beamercolorbox}[right,wd=0.3\paperwidth,ht=3ex,dp=1.4ex,rightskip=3ex]{date in head/foot}
    \usebeamerfont{date in head/foot}\insertdate\hspace{1ex}\insertframenumber/\inserttotalframenumber
\end{beamercolorbox}}
}


%% Title
\title{SWIFT: Fast algorithms for SPH on multi-core architectures}
\subtitle{Fast neighbour-finding and task-based parallelism}
\author{Pedro Gonnet}
\date{June 5th, 2013}


%% Slides
\begin{document}

    {
    \setbeamercolor{background canvas}{fg=white,bg=black}
    % \usebackgroundtemplate{\includegraphics[width=\paperwidth]{eagle_50.png}}
        
    \section{Title}
    
    \begin{frame}[plain]
        \color{white}
        \vspace{15ex}
        {\bf\huge \inserttitle \\}
        \vspace{1.5ex}
        \Large \insertsubtitle \\
        \vspace{4ex}
        \large \insertauthor, Matthieu Schaller, Tom Theuns, Aidan Chalk \\
        \vspace{0.2ex} ECS/ICC, Durham University \\
        \vspace{0.2ex} 8th International SPHERIC Workshop, \insertdate \\
    \end{frame}
    }
    
    \setbeamercolor{background canvas}{fg=black,bg=grey}
    
    
    \section{Take-home messages}
    \subsection{What this talk is all about}
    
    \begin{frame}
        \begin{itemize}
        
            \pause
        
            \item<+-> In order to continue getting \alert<.>{\em faster}, programs
                need to become \alert<+>{\em more parallel}.
                
                % \vspace{0.5ex}
                
                \uncover<+->{$\longrightarrow$
                    If a program has reached its \alert<.>{maximum degree of parallelism},
                    it won't get any faster.} \uncover<+->{\alert<.>{Ever}.}
                
                \vspace{0.5ex}
                
            \item<+-> On shared-memory systems,
                asynchronous \alert<.>{task-based parallelism} solves most
                problems with concurrency and scaling.
                    
                % \vspace{0.5ex}
                
                \uncover<+->{$\longrightarrow$
                    But we still need to develop task-based algorithms
                    for \alert<.>{specific problems}, e.g.~SPH simulations.}
                
                \vspace{0.5ex}
                
            \item<+-> Better algorithms alone can lead to speedups
                of \alert<.>{up to a factor of ten}.
                
                % \vspace{0.5ex}
                
                \uncover<+->{$\longrightarrow$
                    Better use of both \alert<.>{existing} and \alert<+>{future}
                    infrastructure.}
                    
        \end{itemize}
    \end{frame}
        
    
    \subsection{Case in point}
    
    \begin{frame}
        \begin{itemize}
        
            \pause
        
            \item<+-> \alert<.>{Cosmological simulation} (galaxy formation) with
                \alert<+>{1.8\,M particles}
                in a cubic box of 6.25\,Mpc on a 4~$\times$ Intel Xeon X7550 with
                \alert<+>{32 cores}, 2\,GHz.
                
            \item<+-> \alert<.>{{\sc gadget}-3}, MPI-based,
               used for multi-billion particle simulations. 
                
            \item<+-> \alert<.>{\sc swift}, our own code for the same problem.
               % developed with the Institute of Computational Cosmology at
               % Durham University.
               %
               \uncover<+->{\alert<.>{13$\times$ faster} on 32 cores.}
                
        \end{itemize}
        
        \vspace{1ex}
        
        \uncover<5->{\only<-5>{\centerline{\epsfig{file=scaling_gadget.pdf, width=0.9\textwidth}}}}%
        \only<6>{\centerline{\epsfig{file=scaling.pdf, width=0.9\textwidth}}}%
        \only<7->{\centerline{\epsfig{file=scaling_red.pdf, width=0.9\textwidth}}}%
        
    \end{frame}
    
    
    \section{Task-based parallelism}
    \subsection{Main idea}
    
    \begin{frame}
        \begin{itemize}
        
        \pause

        \item<+-> \alert<.>{Shared-memory parallel programming paradigm}
            in which the computation is formulated in an
            \alert<+>{implicitly parallelizable} way that
            automatically avoids most of the problems associated
            with \alert<+>{concurrency and load-balancing}.
            
        \end{itemize}
        
        \vspace{-1.5ex}
                
        \begin{columns}
        
            \column{0.55\textwidth}
            \begin{itemize}
            
                \item<+-> We first reduce the problem to a set of inter-dependent
                    \alert<.>{tasks}.

                \item<+-> For each task, we need to know:
                    
                    \begin{itemize}
                        \item<+-> Which tasks it \alert<.>{depends} on,
                        \item<+-> Which tasks it \alert<.>{conflicts} with.
                    \end{itemize}
                    
                \item<+-> Each thread then \alert<.>{picks up a task} which
                    has no unresolved dependencies or conflicts and computes it.
                    
            \end{itemize}
            
            \column{0.45\textwidth}
                \only<5>{\centerline{\epsfig{file=img008.pdf,width=0.9\textwidth}}}%
                \only<6>{\centerline{\epsfig{file=img008h.pdf,width=0.9\textwidth}}}%
                \only<7>{\centerline{\epsfig{file=img008b.pdf,width=0.9\textwidth}}}%
                \only<8>{\centerline{\epsfig{file=img008c.pdf,width=0.9\textwidth}}}%
                \only<9>{\centerline{\epsfig{file=img008.pdf,width=0.9\textwidth}}}%
                \only<10>{\centerline{\epsfig{file=img008d.pdf,width=0.9\textwidth}}}%
                \only<11>{\centerline{\epsfig{file=img008e.pdf,width=0.9\textwidth}}}%
                \only<12>{\centerline{\epsfig{file=img008e2.pdf,width=0.9\textwidth}}}%
                \only<13>{\centerline{\epsfig{file=img008f.pdf,width=0.9\textwidth}}}%
                \only<14>{\centerline{\epsfig{file=img008f2.pdf,width=0.9\textwidth}}}%
            
        \end{columns}
    \end{frame}
    
    
    \subsection{Main advantages}
    
    \begin{frame}
        \begin{itemize}
        
            \pause
            
            \item<+-> The order in which the tasks are processed is
                \alert<.>{dynamic} and
                adapts automatically to load imbalances.
                
            \item<+-> If the dependencies and conflicts are specified correctly,
                we \alert<.>{do not have to worry about concurrency} at the level
                of the individual tasks.
                
                \uncover<+->{$\longrightarrow$ No need for expensive
                    \alert<.>{explicit}
                    locking, synchronization, or atomic operations.}
                    
            \item<+-> Each task has exclusive access to the data it
                is working on, thus \alert<.>{improving cache efficiency}.
            
            \item<+-> The same approach can be applied to more
                \alert<.>{unconventional many-core systems} such as GPUs.
            
            \item<+-> However, this usually means that
                we have to \alert<.>{re-think our entire computation},
                e.g.~redesign it from scratch to make it task-based.
                
        \end{itemize}
    \end{frame}
    
    
    \section{Task-based algorithms for SPH}
    \subsection{Neighbour-finding with trees}
    
    \begin{frame}
    
        \pause
    
        \begin{columns}
        
            \column{0.65\textwidth}
            \begin{itemize}

                \item<+-> In multi-resolution SPH,
                    \alert<.>{\alert<+>{Spatial trees}} are the most commonly used
                    approach to neighbour-finding, as the particle distribution
                    can be irregular.
                    
                \item<+-> Neighbour-finding up and down the tree is
                    \alert<.>{\alert<+>{simple}}, but has some problems:
                    
                    \begin{itemize}
                        \item<+-> Worst-case cost in 
                            \alert<.>{$\mathcal O(N^{2/3})$} per particle.
                        \item<+-> \alert<.>{Low cache efficiency} due to scattered
                            memory access.
                        \item<+-> Symmetries cannot be exploited, i.e.~each
                            particle pair is \alert<.>{found twice}.
                    \end{itemize}
                    
                \item<+-> Parallelization is \alert<.>{trivial}, but only because
                    symmetries are not exploited.

            \end{itemize}
            
            \column{0.35\textwidth}
                \only<2>{\centerline{\epsfig{file=Octree.pdf,width=0.8\textwidth}}}%
                \only<3>{\centerline{\epsfig{file=SearchTree_001.pdf,width=0.8\textwidth}}}%
                \only<4>{\centerline{\epsfig{file=SearchTree_002.pdf,width=0.8\textwidth}}}%
                \only<5->{\centerline{\epsfig{file=SearchTree_003.pdf,width=0.8\textwidth}}}%
            
        \end{columns}
    \end{frame}
    
    
    \subsection{Hierarchical cell pairs}
    
    \begin{frame}
    
        \pause

        \begin{columns}
        
            \column{0.65\textwidth}
            \begin{itemize}

                \item<+-> We start by splitting the simulation domain into
                    rectangular \alert<.>{cells} of edge length at least
                    $h_\mathsf{max}$.
                    
                \item<+-> All interacting particle pairs are then in either
                    in the \alert<.>{same cell}, or in a
                    \alert<+>{pair of neighbouring cells}.
                    
                \item<+-> Finding \alert<.>{all neighbours} within each cell
                    or between each
                    pair of cells can be used as a \alert<+>{task}.
                    %
                    \uncover<+->{$\longrightarrow$ Instead of traversing
                        the tree for each particle, we \alert<.>{traverse a list of
                        cells and cell pairs} and compute all interactions.}
                    
            \end{itemize}
            
            \column{0.35\textwidth}
                \only<2>{\centerline{\epsfig{file=InitialDecomp_001.pdf,width=0.8\textwidth}}}%
                \only<3-4>{\centerline{\epsfig{file=InitialDecomp_002.pdf,width=0.8\textwidth}}}%
                \only<5-7>{\centerline{\epsfig{file=InitialDecomp_004.pdf,width=0.8\textwidth}}}%
         
        \end{columns}
    \end{frame}
    
    
    \begin{frame}
    
        \begin{columns}
        
            \column{0.65\textwidth}
            \begin{itemize}
            
                \item<+-> In a mutli-resolution setting, the initial
                    cell-based decomposition can be \alert<.>{extremely
                    inefficient}.
                    
                \item<+-> If the particles in a cell 
                    are sufficiently small, the self-interaction task
                    can be \alert<.>{split}.
                    
                \item<+-> Likewise, if the particles in a cell-pair 
                    are sufficiently small, the task
                    can be \alert<.>{split} as well.
                    
                \item<+-> Finally, the particles in each cell pair are
                    \alert<.>{first sorted} along the cell pair axis to speed-up
                    neighbour-finding.

            \end{itemize}
            
            \column{0.35\textwidth}
                \only<2>{\centerline{\epsfig{file=SplitCell.pdf,width=0.4\textwidth}}}%
                \only<3>{\centerline{\epsfig{file=SplitPair.pdf,width=0.8\textwidth}}}%
                \only<4>{\centerline{\epsfig{file=SortedInteractions.pdf,width=0.8\textwidth}}}%
         
        \end{columns}
    \end{frame}
    
    
    \subsection{Task hierarchy}
    
    \begin{frame}
    
        \pause

        \begin{columns}
        
            \column{0.65\textwidth}
            \begin{itemize}

                \item<+-> \alert<.>{Three main task types}: \alert<+>{Sorting},
                    \alert<+>{self-interactions},
                    and \alert<+>{pair-interactions}.
                    
                \item<+-> \alert<.>{``Ghost'' tasks} are added to group dependencies
                    between the density and force tasks of each cell.
                    
                \item<+-> Each \alert<.>{sorting task} depends on the sorting tasks
                    of its sub-cells (merge-sort).
                    
                \item<+-> Each \alert<.>{pair-interaction} task depends on the sort tasks
                    of the cells involved.
                    
                \item<+-> Tasks on \alert<.>{overlapping cells conflict}, i.e.~they can
                    not execute concurrently.

            \end{itemize}
            
            \column{0.35\textwidth}
                \only<2>{\centerline{\epsfig{file=Hierarchy2.pdf,height=0.8\textheight}}}%
                \only<3>{\centerline{\epsfig{file=Hierarchy2_005.pdf,height=0.8\textheight}}}%
                \only<4>{\centerline{\epsfig{file=Hierarchy2_007.pdf,height=0.8\textheight}}}%
                \only<5>{\centerline{\epsfig{file=Hierarchy2_006.pdf,height=0.8\textheight}}}%
                \only<6>{\centerline{\epsfig{file=Hierarchy2_008.pdf,height=0.8\textheight}}}%
                \only<7>{\centerline{\epsfig{file=Hierarchy2_001.pdf,height=0.8\textheight}}}%
                \only<8>{\centerline{\epsfig{file=Hierarchy2_002.pdf,height=0.8\textheight}}}%
                \only<9>{\centerline{\epsfig{file=Hierarchy2_004.pdf,height=0.8\textheight}}}%
         
        \end{columns}
    \end{frame}
    
    
    \subsection{Dynamic task allocation}
    
    \begin{frame}
    
        \centerline{\epsfig{file=tasks_dynamic.pdf,width=\textwidth}}
        
        \pause
        
        \begin{itemize}
        
            \item<+-> Each core has it's own task queue and uses
                \alert<.>{work-stealing} when empty.
                
            \item<+-> Each core has a \alert<.>{preference} for tasks involving
                cells which were used previously to improve cache re-use.

        \end{itemize}
    \end{frame}
    
    
    \subsection{Parallel efficiency and scaling}
    
    \begin{frame}
    
        \pause
        
        \centerline{\epsfig{file=scaling.pdf,width=0.8\textwidth}}
        
        \pause
        
        \begin{itemize}
        
            \item<+-> Parallel efficiency of \alert<.>{$75\%$} on 32 cores of
                an $4\times$ Intel Xeon X7550 shared-memory machine.
                
            \item<+-> \alert<.>{No NUMA-related effects}: Each
                task operates exclusively on a small contiguous region
                of memory which usually
                fits in the \alert<+>{lower level caches}.

        \end{itemize}
    \end{frame}
    
    
    \subsection{Further work}
    
    \begin{frame}
        \begin{itemize}
        
            \pause
            
            \item<+-> {\sc swift} is currently an Open-Source \alert<.>{research
                code}, but we aim to be able to do \alert<+>{production runs}
                in Computational Cosmology.
                
            \item<+-> \alert<.>{Modular structure}, easy to add
                new physics, kernel functions, etc\dots
        
            \item<+-> \alert<.>{Hybrid} shared/distributed-memory parallelism:
            
                \begin{itemize}
                    \item<+-> \alert<.>{Distributed-memory parallelism} between the
                        nodes of a cluster,.
                    \item<+-> \alert<.>{Task-based shared-memory parallelism} between
                        the cores of a single node.
                    \item<+-> \alert<.>{SIMD/SIMT parallelism} within each task
                        on a single core.
                \end{itemize}
                
            \item<+-> In a task-based hybrid setup, sending and receiving
                data asynchronously can be implemented as tasks to
                \alert<.>{hide communication latencies}.
                
            \item<+-> \alert<.>{Mixed-precision floating point arithmetic}
                in order to better exploit SIMD vectorization, yet
                \alert<+>{without losing precision}.
                
            \item<+-> Tasks can also be \alert<.>{shared between the
                CPU and a GPU}.
                
        \end{itemize}
    \end{frame}
    
    
    \section{Conclusions}
    \subsection{Take-home messages}
    
    \begin{frame}
        \begin{itemize}
        
            \item<+-> In order to continue getting {\em faster}, programs
                need to become {\em more parallel}.
                
                % \vspace{0.5ex}
                
                $\longrightarrow$
                    If a program has reached its maximum degree of parallelism,
                    it won't get any faster. Ever.
                
                \vspace{0.5ex}
                
            \item On shared-memory systems,
                asynchronous task-based parallelism solves most
                problems with concurrency and scaling.
                    
                % \vspace{0.5ex}
                
                $\longrightarrow$
                    But we still need to develop task-based algorithms
                    for specific problems, e.g.~SPH simulations.
                
                \vspace{0.5ex}
                
            \item Better algorithms alone can lead to speedups
                of up to a factor of ten.
                
                % \vspace{0.5ex}
                
                $\longrightarrow$
                    Better use of both existing and future
                    infrastructure.
                    
            \item<+-> If you have an \alert<.>{interesting problem}, or
                just need a \alert<+>{fast code}, let us know!
                    
        \end{itemize}
    \end{frame}
        
    
    \subsection{Thanks}
    
    \begin{frame}
        \centerline{Thank you for your attention!}
    \end{frame}


\end{document}