\documentclass{beamer}
% \documentclass[handout]{beamer}

%% Some packages
\usepackage{epsfig}
\usepackage{algorithmic}
\usepackage{array}
\usepackage{eurosym}
\usepackage{listings}
\lstset{basicstyle=\small\tt}
\usepackage[orientation=landscape,size=custom,width=16.8,height=10.5,scale=0.5]{beamerposter}
%\usepackage[T1]{fontenc}

%% Some macros
\newcommand\dx{\,\mbox{d}x}

%% Table column types
\newcommand\PBS[1]{\let\temp=\\%
  #1%
  \let\\=\temp}
\newcolumntype{x}[1]{>{\PBS\raggedright\hspace{0pt}}p{#1}}

%% ETH font
\renewcommand{\familydefault}{let}
\renewcommand{\sfdefault}{let}
%\setbeamerfont{normal text}{family=\sffamily}

%% Colors
\xdefinecolor{grey}{rgb}{0.95,0.95,0.9375}
\xdefinecolor{myblue}{rgb}{0.664,0.762,0.867}
\xdefinecolor{myred}{rgb}{1.0,0.0,0.328}
\setbeamercolor{background canvas}{fg=black,bg=grey}
\setbeamercolor{normal text}{fg=black,bg=grey}
\setbeamercolor{itemize item}{fg=myblue}
\setbeamercolor{itemize subitem}{fg=myblue}
\setbeamercolor{itemize subsubitem}{fg=myblue}
\setbeamercolor{alert}{fg=myred}
\setbeamercolor{section in head/foot}{fg=white,bg=black}
\setbeamerfont{section in head/foot}{size=\LARGE}
\setbeamercolor{subsection in head/foot}{fg=white,bg=black}
\setbeamerfont{subsection in head/foot}{size=\normalsize}
\setbeamercolor{title in head/foot}{fg=white,bg=black}
\setbeamerfont{title in head/foot}{size=\tiny}
\setbeamercolor{date in head/foot}{fg=white,bg=black}
\setbeamerfont{date in head/foot}{size=\tiny}
\setbeamerfont{itemize subitem}{size=\normalsize}
\setbeamerfont{footnote}{size=\tiny}

%% Margins
\setbeamersize{text margin left=3ex}
\setbeamersize{text margin right=5ex}
\setbeamersize{sidebar width left=0ex}
\setbeamersize{sidebar width right=0ex}

%% Define the bullet
\setbeamertemplate{itemize item}[square]

%% Define the Headline
% \setbeamertemplate{headline}{
%   \begin{beamercolorbox}[wd=0.8\paperwidth,ht=8ex,dp=2ex,leftskip=3ex]{section in head/foot}
%     \usebeamerfont{section in head/foot}\insertsection
%   \end{beamercolorbox}
%   \begin{beamercolorbox}[wd=0.8\paperwidth,ht=4ex,dp=3.5ex,leftskip=3ex,rightskip=3ex]{subsection in head/foot}
%     \usebeamerfont{subsection in head/foot}\insertsubsection
%   \end{beamercolorbox}
%   \begin{beamercolorbox}[right,wd=0.2\paperwidth,ht=12ex,dp=2ex,rightskip=3ex]{section in head/foot}
%     \includegraphics[width=10ex]{logo.pdf}
%   \end{beamercolorbox}
% }
\setbeamertemplate{headline}{
  \hbox{\begin{beamercolorbox}[wd=0.75\paperwidth,ht=14ex]{}
    \begin{beamercolorbox}[wd=0.75\paperwidth,ht=8ex,dp=2ex,leftskip=3ex]{section in head/foot}
      \usebeamerfont{section in head/foot}\insertsection
    \end{beamercolorbox}
    \begin{beamercolorbox}[wd=0.75\paperwidth,ht=4ex,dp=3.5ex,leftskip=3ex,rightskip=3ex]{subsection in head/foot}
      \usebeamerfont{subsection in head/foot}\insertsubsection
    \end{beamercolorbox}\end{beamercolorbox}%
  \begin{beamercolorbox}[right,wd=0.25\paperwidth,ht=14ex,dp=3.5ex,rightskip=3ex]{section in head/foot}
    \vspace{-1ex}\includegraphics[height=12ex]{DU_W_O_med.png}
  \end{beamercolorbox}}
}

%% Define the Footline
\setbeamertemplate{footline}{
  \hbox{\begin{beamercolorbox}[wd=0.7\paperwidth,ht=3ex,dp=1.4ex,leftskip=3ex]{title in head/foot}
    \usebeamerfont{title in head/foot} \insertauthor: \inserttitle
  \end{beamercolorbox}%
  \begin{beamercolorbox}[right,wd=0.3\paperwidth,ht=3ex,dp=1.4ex,rightskip=3ex]{date in head/foot}
    \usebeamerfont{date in head/foot}\insertdate\hspace{1ex}\insertframenumber/\inserttotalframenumber
  \end{beamercolorbox}}
}

%% Title
\title{Algorithms for shared-memory parallel high-performance computing}
\subtitle{Or how to get more out of existing hardware today}
\author{Pedro Gonnet}
\date{May 30, 2013}

%% Slides
\begin{document}

{
\setbeamercolor{background canvas}{fg=white,bg=black}
% \usebackgroundtemplate{\includegraphics[width=\paperwidth]{eagle_50.png}}
\section{Title}
\begin{frame}[plain]
  \color{white}
  \vspace{15ex}
  {\bf\huge \inserttitle \\}
  \vspace{1.5ex}
  \Large \insertsubtitle \\
  \vspace{4ex}
  \large \insertauthor \\
  \vspace{0.2ex}
  School of Engineering and Computing Sciences, Durham University \\
  \vspace{0.2ex}
  Universit\"at Basel, \insertdate \\
\end{frame}
}

\setbeamercolor{background canvas}{fg=black,bg=grey}

\section{Take-home messages}
\subsection{What my research is all about}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> In order to continue getting \alert<.>{\em faster}, programs need to
    become \alert<+>{\em more parallel}.
    \vspace{0.5ex}
    \uncover<+->{$\longrightarrow$ If a program has reached its
      \alert<.>{maximum degree of parallelism}, it won't get any faster.}
    \uncover<+->{\alert<.>{Ever}.}
    \vspace{0.5ex}
  \item<+-> On shared-memory systems, asynchronous \alert<.>{task-based parallelism}
    solves most problems with concurrency and scaling.
    \uncover<+->{\alert<.>{Even on GPUs}.}
    \vspace{0.5ex}
    \uncover<+->{$\longrightarrow$ But we still need to develop task-based
      algorithms for \alert<.>{specific problems}.}
    \vspace{0.5ex}
  \item<+-> My own work in Molecular Dynamics and Smoothed Particle Hydrodynamics
    simulations shows that speedups of \alert<.>{more than a factor of ten} are
    possible.
    \vspace{0.5ex}
    \uncover<+->{$\longrightarrow$ Better use of both \alert<.>{existing} and
      \alert<+>{future} infrastructure.}
    \vspace{0.5ex}
  \item<+-> The goal is to free HPC from the grip of expensive clusters and
    specialized hardware, thus making it \alert<.>{more accessible} to a wider
    audience of users.
  \end{itemize}
\end{frame}

\section{Parallel programming paradigms}
\subsection{Distributed-memory parallelism}
\begin{frame}
  \pause
  \begin{columns}[c]
    \column{0.6\textwidth}
    \begin{itemize}
    \item<+-> \alert<.>{Distributed-memory parallelism}, e.g.~using MPI, is based on
      \alert<+>{data decomposition}, i.e.~each processor is assigned part of the
      problem to work on and communicates with its neighbours.
    \item<+-> \alert<.>{Surface-to-volume ratio problem}: As the number of cores
      increases, the amount of \alert<+>{computation per core (volume) decreases}
      while the relative amount of \alert<+>{communication (surface) increases}.
    \item<+-> The entire computation is eventually \alert<.>{dominated by
      communication}, thus seriously degrading parallel efficiency.
    \end{itemize}
    \column{0.4\textwidth}
    \uncover<2>{\only<-2>{\centerline{\epsfig{file=MPIScaling001.pdf,width=0.65\textwidth}}}}%
    \only<3-4>{\centerline{\epsfig{file=MPIScaling003.pdf,width=0.65\textwidth}}}%
    % \only<5>{\centerline{\epsfig{file=MPIScaling002.pdf,width=0.65\textwidth}}}%
    \only<5->{\centerline{\epsfig{file=MPIScaling004.pdf,width=0.65\textwidth}}}%
  \end{columns}
\end{frame}

\subsection{OpenMP}
\begin{frame}
  \pause
  \begin{columns}[c]
    \column{0.6\textwidth}
    \begin{itemize}
    \item<+-> \alert<.>{Shared-memory parallelism} using OpenMP,
      i.e.~\alert<+>{annotating an inherently serial code}, is often hampered by
      \alert<+>{frequent synchronization}.
    \item<+-> \alert<.>{Concurrency problems} need to be addressed explicitly,
      e.g.~using barriers or atomic instructions.
    \item<+-> The costs associated with these two problems \alert<.>{only get worse}
      as the number of cores increases.
    \end{itemize}
    \column{0.4\textwidth}
    \only<2>{\centerline{\epsfig{file=OMPCode_003.pdf,width=0.7\textwidth}}}%
    \only<3>{\centerline{\epsfig{file=OMPCode_002.pdf,width=0.7\textwidth}}}%
    \only<4>{\centerline{\epsfig{file=OMPScaling003.pdf,width=0.7\textwidth}}}%
    \only<5->{\centerline{\epsfig{file=OMPCode_001.pdf,width=0.7\textwidth}}}%
  \end{columns}
\end{frame}
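%% Illustrative OpenMP sketch (not the code from the OMPCode figures above).
\begin{frame}[fragile]
  A minimal sketch of the kind of annotation meant on the previous slide: the loop
  parallelizes with a single pragma, but shared updates need an explicit atomic and
  every parallel loop ends in an implicit barrier, which is exactly the
  synchronization cost that grows with the core count. The particle type and the
  per-particle work are made-up placeholders, not code from any of the codes shown
  later.
  \begin{lstlisting}[language=C]
#include <omp.h>

struct part { double x[3], m, h, rho; };

/* Sum a per-particle quantity; the per-particle arithmetic below is
   only a stand-in for the real density computation. */
double total_density(int N, struct part *parts) {
  double rho_total = 0.0;
  #pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < N; i++) {
    parts[i].rho = parts[i].m / (parts[i].h * parts[i].h * parts[i].h);
    #pragma omp atomic              /* explicit concurrency control */
    rho_total += parts[i].rho;
  }                                 /* implicit barrier at the end of the loop */
  return rho_total;
}
  \end{lstlisting}
\end{frame}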
\subsection{Case in point}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> \alert<.>{Cosmological simulation} (galaxy formation) with
    \alert<+>{1.8\,M particles} in a cubic box of 6.25\,Mpc on a
    4~$\times$ Intel Xeon X7550 with \alert<+>{32 cores}, 2\,GHz.
  \item<+-> \alert<.>{{\sc gadget}-2}, MPI-based, used for multi-billion particle
    simulations.
  \item<+-> \alert<.>{\sc swift}, my own code for the same problem.
    % developed with the Institute of Computational Cosmology at
    % Durham University.
    % \uncover<+->{\alert<.>{13$\times$ faster} on 32 cores.}
  \end{itemize}
  \vspace{1ex}
  \uncover<5->{\only<-5>{\centerline{\epsfig{file=scaling_gadget.pdf, width=0.9\textwidth}}}}%
  \only<6>{\centerline{\epsfig{file=scaling.pdf, width=0.9\textwidth}}}%
  \only<7->{\centerline{\epsfig{file=scaling_red.pdf, width=0.9\textwidth}}}%
\end{frame}

\subsection{Task-based parallelism}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> \alert<.>{Shared-memory parallel programming paradigm} in which the
    computation is formulated in an \alert<+>{implicitly parallelizable} way that
    automatically avoids most of the problems associated with
    \alert<+>{concurrency and load-balancing}.
  \end{itemize}
  \vspace{-1.5ex}
  \begin{columns}
    \column{0.55\textwidth}
    \begin{itemize}
    \item<+-> We first reduce the problem to a set of inter-dependent
      \alert<.>{tasks}.
    \item<+-> For each task, we need to know:
      \begin{itemize}
      \item<+-> Which tasks it \alert<.>{depends} on,
      \item<+-> Which tasks it \alert<.>{conflicts} with.
      \end{itemize}
    \item<+-> Each thread then \alert<.>{picks up a task} which has no unresolved
      dependencies or conflicts and computes it.
    \end{itemize}
    \column{0.45\textwidth}
    \only<5>{\centerline{\epsfig{file=img008.pdf,width=0.9\textwidth}}}%
    \only<6>{\centerline{\epsfig{file=img008h.pdf,width=0.9\textwidth}}}%
    \only<7>{\centerline{\epsfig{file=img008b.pdf,width=0.9\textwidth}}}%
    \only<8>{\centerline{\epsfig{file=img008c.pdf,width=0.9\textwidth}}}%
    \only<9>{\centerline{\epsfig{file=img008.pdf,width=0.9\textwidth}}}%
    \only<10>{\centerline{\epsfig{file=img008d.pdf,width=0.9\textwidth}}}%
    \only<11>{\centerline{\epsfig{file=img008e.pdf,width=0.9\textwidth}}}%
    \only<12>{\centerline{\epsfig{file=img008e2.pdf,width=0.9\textwidth}}}%
    \only<13>{\centerline{\epsfig{file=img008f.pdf,width=0.9\textwidth}}}%
    \only<14>{\centerline{\epsfig{file=img008f2.pdf,width=0.9\textwidth}}}%
  \end{columns}
\end{frame}

\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> The order in which the tasks are processed is \alert<.>{dynamic} and
    adapts automatically to load imbalances.
  \item<+-> If the dependencies and conflicts are specified correctly, we
    \alert<.>{do not have to worry about concurrency} at the level of the
    individual tasks.
    \uncover<+->{$\longrightarrow$ No need for expensive \alert<.>{explicit}
      locking, synchronization, or atomic operations.}
  \item<+-> Each task has exclusive access to the data it is working on, thus
    \alert<.>{improving cache efficiency}.
  \item<+-> The same approach can be applied to more \alert<.>{unconventional
    many-core systems} such as GPUs.
  \item<+-> However, this usually means that we have to \alert<.>{re-think our
    entire computation}, e.g.~redesign it from scratch to make it task-based.
  \end{itemize}
\end{frame}
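%% Illustrative task/worker sketch (not the actual scheduler).
\begin{frame}[fragile]
  A minimal sketch, under the assumptions of the previous slides, of what a task
  and a worker thread could look like: all locking lives in the scheduler, so the
  task function itself needs no synchronization. This is not the actual
  {\sc swift} or {\tt mdcore} scheduler; {\tt queue\_get()} and {\tt queue\_put()}
  are placeholders.
  \begin{lstlisting}[language=C,basicstyle=\scriptsize\tt]
#include <pthread.h>

struct task {
  void (*run)(void *);        /* the computation itself */
  void *data;                 /* data this task owns exclusively */
  int wait;                   /* number of unresolved dependencies */
  struct task **unlocks;      /* tasks that depend on this one */
  int nr_unlocks;
  pthread_mutex_t *conflict;  /* lock shared by conflicting tasks */
};

/* Placeholder queue interface: queue_get() only returns tasks with wait == 0. */
struct task *queue_get(void *q);
void queue_put(void *q, struct task *t);

/* Executed by every worker thread. */
void *worker(void *queue) {
  struct task *t;
  while ((t = queue_get(queue)) != NULL) {
    if (pthread_mutex_trylock(t->conflict) != 0) {
      queue_put(queue, t);                    /* conflict, try another task */
      continue;
    }
    t->run(t->data);                          /* no locking needed in here */
    pthread_mutex_unlock(t->conflict);
    for (int k = 0; k < t->nr_unlocks; k++)   /* resolve dependencies */
      __sync_fetch_and_sub(&t->unlocks[k]->wait, 1);
  }
  return NULL;
}
  \end{lstlisting}
\end{frame}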
\subsection{Task-based parallelism on GPUs}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> Although general-purpose \alert<.>{Graphics Processing Units} (GPUs)
    are usually treated as \alert<+>{large vector machines}, the hardware is more
    reminiscent of a \alert<+>{vectorized multi-core}.
  \item<+-> Each GPU is split into several \alert<.>{\em Streaming Multiprocessors}
    which execute several \alert<+>{\em blocks} of threads independently.
  \item<+-> It is therefore possible to use task-based parallelism in which
    \alert<.>{every block works on a separate task} with a small number of threads.
    \uncover<+->{$\longrightarrow$ Opens up the GPU for \alert<.>{inhomogeneous}
      workloads.}
  \item<+-> Requires extremely efficient, non-blocking data structures for the
    \alert<.>{task queue}.
  \item<+-> Instead of \alert<.>{vectorizing the entire computation}, we only have
    to \alert<+>{vectorize each task} independently.
  \end{itemize}
\end{frame}

\subsection{My work in this area}
\begin{frame}
  \begin{itemize}
  \pause
  % \item<+-> \alert<.>{Task-based computational models},
  %   i.e.~develop memory-efficient, task-based algorithms
  %   for specific computational problems.
  \item<+-> \alert<.>{Algorithms for task-based parallelism}, i.e.~task sorting,
    task clustering, task selection, and efficient task queues.
  \item<+-> \alert<.>{Task-based parallelism on GPUs}, i.e.~specific data structures
    for fast, non-blocking queues.
  \item<+-> \alert<.>{New architectures}, i.e.~algorithms for emerging
    high-performance/low-cost many-core architectures, e.g.~the Intel~Phi, or
    Adapteva's Epiphany~IV.
  \item<+-> \alert<.>{Hybrid shared/distributed-memory schemes}, i.e.~use both MPI
    and task-based parallelism on clusters of multi-cores, interleaving
    communication and computation so as to hide communication latencies.
  \end{itemize}
\end{frame}

\section{Algorithms for SPH}
\subsection{Smoothed Particle Hydrodynamics}
\begin{frame}
  \pause
  \begin{columns}
    \column{0.65\textwidth}
    \begin{itemize}
    \item<+-> \alert<.>{Smoothed Particle Hydrodynamics} (SPH) is a computational
      method for simulating fluid flows and is used in many areas, e.g.~in
      Astrophysics or avalanche simulations.
    \item<+-> Fluids are modelled using \alert<.>{particles} which interact with
      their neighbours in a two-stage process:
      \begin{itemize}
      \item<+-> Compute the particle \alert<.>{density},
      \item<+-> Compute the particle \alert<.>{forces}.
      \end{itemize}
    \item<+-> In each step, the \alert<.>{neighbours} of each particle within its
      smoothing length $h_i$ need to be identified.
    \end{itemize}
    \column{0.35\textwidth}
    \only<3-5>{\centerline{\epsfig{file=SPH_001.pdf,width=0.7\textwidth}}}%
    \only<6->{\centerline{\epsfig{file=SPH_002.pdf,width=0.7\textwidth}}}%
    \uncover<4->{\begin{eqnarray*}
      \alert<4>{\rho_i} & = & \sum_{\alert<6>{r_{ij} < h_i}} m_j W(r_{ij}, h_i), \\
      \alert<5>{{\mathbf f}_i} & = & -\sum_{\alert<6>{r_{ij} < \max(h_i,h_j)}} m_j
        \left[ \frac{P_i}{\rho_i^2} \nabla W(r_{ij},h_i)
             + \frac{P_j}{\rho_j^2} \nabla W(r_{ij},h_j) \right]
    \end{eqnarray*}}
  \end{columns}
\end{frame}

\subsection{Neighbour-finding with trees}
\begin{frame}
  \pause
  \begin{columns}
    \column{0.65\textwidth}
    \begin{itemize}
    \item<+-> \alert<.>{\alert<+>{Spatial trees}} are the most commonly used
      approach to neighbour-finding, as the particle distribution can be irregular.
    \item<+-> Neighbour-finding up and down the tree is
      \alert<.>{\alert<+>{simple}}, but has some problems:
      \begin{itemize}
      \item<+-> Worst-case cost in \alert<.>{$\mathcal O(N^{2/3})$} per particle.
      \item<+-> \alert<.>{Low cache efficiency} due to scattered memory access.
      \item<+-> Symmetries cannot be exploited, i.e.~each particle pair is
        \alert<.>{found twice}.
      \end{itemize}
    \item<+-> Parallelization is \alert<.>{trivial}, but only because symmetries
      are not exploited.
    \end{itemize}
    \column{0.35\textwidth}
    \only<1>{\centerline{\epsfig{file=Octree.pdf,width=0.8\textwidth}}}%
    \only<2>{\centerline{\epsfig{file=SearchTree_001.pdf,width=0.8\textwidth}}}%
    \only<3>{\centerline{\epsfig{file=SearchTree_002.pdf,width=0.8\textwidth}}}%
    \only<4->{\centerline{\epsfig{file=SearchTree_003.pdf,width=0.8\textwidth}}}%
  \end{columns}
\end{frame}
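%% Illustrative per-cell density sketch (placeholder types and kernel).
\begin{frame}[fragile]
  For reference, a minimal sketch of the density sum from the SPH slide, written
  out for the particles of a single cell: since both particles of a pair are
  updated at once, the symmetry that the tree-based search cannot exploit is used
  here. The particle fields and {\tt kernel\_W()} are placeholders, not the
  {\sc swift} code.
  \begin{lstlisting}[language=C]
#include <math.h>

struct part { double x[3], h, m, rho; };

double kernel_W(double r, double h);  /* placeholder smoothing kernel */

/* What a single self-interaction (density) task would compute. */
void density_self(struct part *parts, int count) {
  for (int i = 0; i < count; i++)
    for (int j = i + 1; j < count; j++) {
      double r2 = 0.0;
      for (int k = 0; k < 3; k++) {
        double dx = parts[i].x[k] - parts[j].x[k];
        r2 += dx * dx;
      }
      double r = sqrt(r2);
      if (r < parts[i].h) parts[i].rho += parts[j].m * kernel_W(r, parts[i].h);
      if (r < parts[j].h) parts[j].rho += parts[i].m * kernel_W(r, parts[j].h);
    }
}
  \end{lstlisting}
\end{frame}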
\subsection{Hierarchical cell pairs}
\begin{frame}
  \pause
  \begin{columns}
    \column{0.65\textwidth}
    \begin{itemize}
    \item<+-> We start by splitting the simulation domain into rectangular
      \alert<.>{cells} of edge length at least $h_\mathsf{max}$.
    \item<+-> All interacting particle pairs are then either in the
      \alert<.>{same cell}, or in a \alert<+>{pair of neighbouring cells}.
    \item<+-> Finding the neighbours between each such cell or pair of cells can
      be used as a \alert<.>{task}.
    \item<+-> If the smoothing lengths of the particles in a cell or cell pair are
      sufficiently small, the task can be \alert<.>{split}.
    \item<+-> Finally, the particles in each cell pair are \alert<.>{first sorted}
      along the cell pair axis to speed up neighbour-finding.
    \end{itemize}
    \column{0.35\textwidth}
    \only<2>{\centerline{\epsfig{file=InitialDecomp_001.pdf,width=0.8\textwidth}}}%
    \only<3-4>{\centerline{\epsfig{file=InitialDecomp_002.pdf,width=0.8\textwidth}}}%
    \only<5>{\centerline{\epsfig{file=InitialDecomp_004.pdf,width=0.8\textwidth}}}%
    \only<6>{\centerline{\epsfig{file=SplitCell.pdf,width=0.8\textwidth}}
      \vspace{2ex}
      \centerline{\epsfig{file=SplitPair.pdf,width=\textwidth}}}%
    \only<7>{\centerline{\epsfig{file=SortedInteractions.pdf,width=0.8\textwidth}}}%
  \end{columns}
\end{frame}

\subsection{Task hierarchy}
\begin{frame}
  \pause
  \begin{columns}
    \column{0.65\textwidth}
    \begin{itemize}
    \item<+-> \alert<.>{Three main task types}: \alert<+>{Sorting},
      \alert<+>{self-interactions}, and \alert<+>{pair-interactions}.
    \item<+-> \alert<.>{``Ghost'' tasks} are added to group dependencies between
      the density and force tasks.
    \item<+-> Each \alert<.>{sorting task} depends on the sorting tasks of its
      sub-cells (merge-sort).
    \item<+-> Each \alert<.>{pair-interaction} task depends on the sort tasks of
      the cells involved.
    \item<+-> Tasks on \alert<.>{overlapping cells conflict}, i.e.~they cannot
      execute concurrently.
    \end{itemize}
    \column{0.35\textwidth}
    \only<2>{\centerline{\epsfig{file=Hierarchy2.pdf,height=0.8\textheight}}}%
    \only<3>{\centerline{\epsfig{file=Hierarchy2_005.pdf,height=0.8\textheight}}}%
    \only<4>{\centerline{\epsfig{file=Hierarchy2_007.pdf,height=0.8\textheight}}}%
    \only<5>{\centerline{\epsfig{file=Hierarchy2_006.pdf,height=0.8\textheight}}}%
    \only<6>{\centerline{\epsfig{file=Hierarchy2_008.pdf,height=0.8\textheight}}}%
    \only<7>{\centerline{\epsfig{file=Hierarchy2_001.pdf,height=0.8\textheight}}}%
    \only<8>{\centerline{\epsfig{file=Hierarchy2_002.pdf,height=0.8\textheight}}}%
    \only<9>{\centerline{\epsfig{file=Hierarchy2_004.pdf,height=0.8\textheight}}}%
  \end{columns}
\end{frame}

\subsection{Dynamic task allocation}
\begin{frame}
  \centerline{\epsfig{file=tasks_dynamic.pdf,width=\textwidth}}
  \pause
  \begin{itemize}
  \item<+-> Each core has its own task queue and uses \alert<.>{work-stealing}
    when that queue is empty.
  \item<+-> Each core has a \alert<.>{preference} for tasks involving cells it has
    used previously, in order to improve cache re-use.
  \end{itemize}
\end{frame}
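%% Illustrative work-stealing sketch (placeholder queue operations).
\begin{frame}[fragile]
  A minimal sketch of the task selection just described: each core first tries its
  own queue, which holds the tasks it prefers, and only steals from the other
  cores' queues when that queue runs dry. The queue type and the operations
  {\tt queue\_pop()} and {\tt queue\_steal()} are simplified placeholders, not the
  actual implementation.
  \begin{lstlisting}[language=C]
struct task;
struct queue { struct task **tasks; int first, last; };  /* simplified */

struct task *queue_pop(struct queue *q);    /* placeholder: pop a local task */
struct task *queue_steal(struct queue *q);  /* placeholder: steal a task */

struct task *get_task(struct queue *queues, int nr_queues, int myself) {
  /* Prefer the local queue, i.e. tasks on cells this core used before. */
  struct task *t = queue_pop(&queues[myself]);
  /* Work-stealing: if the local queue is empty, try the other queues. */
  for (int k = 1; t == NULL && k < nr_queues; k++)
    t = queue_steal(&queues[(myself + k) % nr_queues]);
  return t;
}
  \end{lstlisting}
\end{frame}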
\subsection{Parallel efficiency and scaling}
\begin{frame}
  \pause
  \centerline{\epsfig{file=scaling.pdf,width=0.8\textwidth}}
  \pause
  \begin{itemize}
  \item<+-> Parallel efficiency of \alert<.>{$75\%$} on 32 cores of a $4\times$
    Intel Xeon X7550 shared-memory machine.
  \item<+-> \alert<.>{No NUMA-related effects}: Each task operates exclusively on a
    small contiguous region of memory which usually fits in the
    \alert<+>{lower-level caches}.
  \end{itemize}
\end{frame}

\subsection{A hierarchy of parallel methods}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> Many problems of interest will be \alert<.>{too large} to simulate on a
    single machine.
  \item<+-> \alert<.>{Hybrid} shared/distributed-memory parallelism:
    \begin{itemize}
    \item<+-> \alert<.>{Distributed-memory parallelism} between the nodes of a
      cluster, using a spatial decomposition.
    \item<+-> \alert<.>{Task-based shared-memory parallelism} between the cores of
      a single node.
    \item<+-> \alert<.>{SIMD/SIMT parallelism} within each task on a single core.
    \end{itemize}
  \item<+-> In a task-based hybrid setup, sending and receiving data asynchronously
    can be implemented as tasks to \alert<.>{hide communication latencies}.
  \item<+-> SIMD/SIMT parallelism is \alert<.>{very similar} for large vectors,
    e.g.~the 512-bit vector units of the Intel~Phi/MIC, and the threads of a GPU.
  \item<+-> Tasks can also be \alert<.>{shared between the CPU and a GPU}.
  \end{itemize}
\end{frame}

\subsection{My work in this area}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> These algorithms have all been implemented as part of
    \alert<.>{\sc swift}, an Open-Source, platform-independent library for SPH
    simulations.
  \item<+-> Ongoing development and \alert<.>{tight collaboration} with the
    Institute of Computational Cosmology in Durham.
    \uncover<+->{$\longrightarrow$ The goal is to produce not just
      \alert<.>{good algorithms}, but also a simulation code that they can use for
      their own research.}
  \item<+-> Use of \alert<.>{mixed-precision} floating point arithmetic and
    algorithms that \alert<.>{vectorize naturally} to better exploit SIMD/SIMT
    parallelism.
  \item<+-> Further work on \alert<.>{improving neighbour finding} and the task
    hierarchy, as well as new algorithms for \alert<+>{long-range interactions},
    e.g.~gravity.
  \item<+-> Currently working on porting {\sc swift} to CUDA-based \alert<.>{GPUs}.
  \end{itemize}
\end{frame}

\section{Molecular Dynamics simulations}
\subsection{{\tt mdcore}}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> \alert<.>{\tt mdcore} is an Open-Source library for hybrid
    shared/distributed-memory parallel Molecular Dynamics (MD) simulations on both
    \alert<+>{multi-cores and GPUs}.
  \item<+-> A similar problem to SPH, yet instead of doing \alert<.>{millions of
    time steps on billions of particles}, MD involves doing \alert<+>{billions of
    time steps on millions of particles}.
    \uncover<+->{$\longrightarrow$ Fine-grained parallel scaling is
      \alert<.>{crucial}!}
  \item<+-> Currently being used as a \alert<.>{test-bed} for:
    \begin{itemize}
    \item<+-> \alert<.>{Asynchronous} distributed-memory parallelism within a
      task-based scheme.
    \item<+-> \alert<.>{Multiple GPU} setups.
    \item<+-> Better use of \alert<.>{SIMD/SIMT} parallelism.
    \item<+-> New architectures such as the \alert<.>{Intel Phi} or
      \alert<+>{Adapteva's Epiphany~III}.
    \end{itemize}
  \item<+-> Despite being a research code, it is already \alert<.>{faster than
    NAMD}, the most popular Open-Source MD code currently available.
  \end{itemize}
\end{frame}
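%% Illustrative sketch of communication wrapped as tasks (placeholder types).
\begin{frame}[fragile]
  A minimal sketch of the asynchronous, task-based communication mentioned above:
  send and receive tasks only start non-blocking MPI calls, and the tasks that need
  the foreign data are released once the receive has completed, so communication
  overlaps with computation. The cell and task types and {\tt enqueue\_unlocks()}
  are placeholders, not the actual {\tt mdcore} or {\sc swift} code.
  \begin{lstlisting}[language=C,basicstyle=\scriptsize\tt]
#include <mpi.h>

struct part { double x[3], h, m, rho; };
struct cell { struct part *parts; int count, tag; };
struct task;
void enqueue_unlocks(struct task *t);  /* placeholder: release dependent tasks */

/* A send or recv task only starts the non-blocking call and returns. */
void run_send_task(struct cell *c, int rank, MPI_Request *req) {
  MPI_Isend(c->parts, (int)(c->count * sizeof(struct part)), MPI_BYTE,
            rank, c->tag, MPI_COMM_WORLD, req);
}
void run_recv_task(struct cell *c, int rank, MPI_Request *req) {
  MPI_Irecv(c->parts, (int)(c->count * sizeof(struct part)), MPI_BYTE,
            rank, c->tag, MPI_COMM_WORLD, req);
}

/* Polled by the scheduler; dependencies resolve only once the data is here. */
void poll_recv(struct task *t, MPI_Request *req) {
  int done = 0;
  MPI_Test(req, &done, MPI_STATUS_IGNORE);
  if (done) enqueue_unlocks(t);
}
  \end{lstlisting}
\end{frame}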
\subsection{{\tt mdcore} on a GPU}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> {\tt mdcore} currently implements task-based parallelism
    \alert<.>{directly on GPUs}.
  \item<+-> For the standard JAC benchmark (23'558 atoms), the GPU code is
    \alert<.>{almost three times as fast} as a 16-core workstation.
    % \uncover<+->{And \alert<.>{costs about a tenth} as much.}
  \end{itemize}
  \vspace{1ex}
  \uncover<3->{\centerline{\epsfig{file=mdcore.pdf, width=0.9\textwidth}}}%
\end{frame}

\section{Interdisciplinary projects}
\subsection{Creating useful collaborations}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> Tight coupling between \alert<.>{theory and practice}: all algorithms
    are implemented as part of \alert<+>{specific interdisciplinary applications}.
    \uncover<+->{$\longrightarrow$ There is a huge difference between
      \alert<.>{knowing how} to solve a problem, and actually \alert<+>{solving it}.}
  \item<+-> Interdisciplinary projects require \alert<.>{close collaboration} with a
    counterpart in the domain of application, so as to make the results
    \alert<+>{immediately useful}.
  \item<+-> Actual implementations require work, but the payoff can be massive in
    terms of interesting \alert<.>{new problems, contributions, and impact}.
  \end{itemize}
\end{frame}

\section{Teaching}
\subsection{Parallel programming as a basic skill}
\begin{frame}
  \begin{itemize}
  \pause
  \item<+-> Shared-memory parallelism is rapidly becoming a \alert<.>{ubiquitous
    reality}, both in high-performance and consumer products.
    \uncover<+->{$\longrightarrow$ Parallel programming should be an important part
      of any \alert<.>{core Computer Science curriculum}.}
  \item<+-> Understanding which parallel programming paradigms are
    \alert<.>{best suited} to which types of problems is of critical importance.
  \item<+-> Teaching needs to provide not only a \alert<.>{solid theoretical
    foundation}, but also \alert<+>{hands-on experience}, e.g.~parallel programming
    labs.
  \item<+-> Include \alert<.>{new paradigms} such as SIMT programming with CUDA and
    many-core programming with OpenCL.
  \item<+-> Interdisciplinary projects and/or new hardware provide a number of
    interesting projects for \alert<.>{Bachelor/Master theses}.
  \end{itemize}
\end{frame}

\section{Conclusions}
\subsection{In summary}
\begin{frame}
  \begin{itemize}
  \item<+-> In order to continue getting \alert<.>{\em faster}, programs need to
    become \alert<+>{\em more parallel}.
  \item<+-> \alert<.>{Task-based parallelism} is a simple, efficient, and
    cross-platform paradigm for shared-memory parallel computing.
    \uncover<+->{$\longrightarrow$ Algorithms for \alert<.>{specific problems} which
      exploit this paradigm are still needed.}
  \item<+-> \alert<.>{Doing things right} can lead to \alert<+>{massive gains in
    performance}.
    \uncover<+->{$\longrightarrow$ Not on some \alert<.>{fancy, expensive future
      hardware}, but on \alert<+>{existing infrastructure}.}
  \item<+-> Interdisciplinary collaborations provide \alert<.>{interesting new
    problems}.
    \uncover<+->{$\longrightarrow$ \alert<.>{Close collaboration} is necessary in
      order to produce software that can actually be used.}
  \end{itemize}
\end{frame}

\subsection{Thanks}
\begin{frame}
  \centerline{Thank you for your attention!}
\end{frame}

\end{document}