diff --git a/paper/figures/QSched.pdf b/paper/figures/QSched.pdf index 1945befe7430ab0d1b679acac21dccb52c003f4d..07d6a694d4a8aa9d9c910fa6251e0716a75fe047 100644 Binary files a/paper/figures/QSched.pdf and b/paper/figures/QSched.pdf differ diff --git a/paper/figures/QSched.svg b/paper/figures/QSched.svg index 16fef7155135b6613fec2be8eac55ba337e59704..c4351a5bea46c3e9b6baae67edce925316af4bd7 100644 --- a/paper/figures/QSched.svg +++ b/paper/figures/QSched.svg @@ -51,14 +51,14 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="1" - inkscape:cx="331.95424" - inkscape:cy="480" + inkscape:zoom="2" + inkscape:cx="508.11752" + inkscape:cy="548.79621" inkscape:document-units="px" inkscape:current-layer="layer1" showgrid="true" - inkscape:window-width="1280" - inkscape:window-height="753" + inkscape:window-width="2560" + inkscape:window-height="1393" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1"> @@ -74,7 +74,7 @@ <dc:format>image/svg+xml</dc:format> <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> + <dc:title /> </cc:Work> </rdf:RDF> </metadata> @@ -636,5 +636,38 @@ id="tspan7789" x="384" y="632.36218">unlock</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3056" + y="436.36218" + x="654.57031" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold" + xml:space="preserve"><tspan + y="436.36218" + x="654.57031" + id="tspan3058" + sodipodi:role="line">thread 0</tspan></text> + <text + xml:space="preserve" + 
style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold" + x="654.57031" + y="496.36218" + id="text3060" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3062" + x="654.57031" + y="496.36218">thread 1</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3064" + y="556.36218" + x="654.57031" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold" + xml:space="preserve"><tspan + y="556.36218" + x="654.57031" + id="tspan3066" + sodipodi:role="line">thread 2</tspan></text> </g> </svg> diff --git a/paper/figures/TaskWeight.pdf b/paper/figures/TaskWeight.pdf index 68138a3c630a68295aa440ec325bab856320e84b..20ba81c40739ca83f5a139657d5abc89c4264cc6 100644 Binary files a/paper/figures/TaskWeight.pdf and b/paper/figures/TaskWeight.pdf differ diff --git a/paper/figures/TaskWeight.svg b/paper/figures/TaskWeight.svg index 4393ed8823fdd88e41d5846545bf63bed17025a3..b854aa3b2ee0be594481dac83eef5135049c7644 100644 --- a/paper/figures/TaskWeight.svg +++ b/paper/figures/TaskWeight.svg @@ -89,14 +89,14 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="1" - inkscape:cx="249.5" - inkscape:cy="522.89698" + inkscape:zoom="2" + inkscape:cx="295.8911" + inkscape:cy="562.92883" inkscape:document-units="px" inkscape:current-layer="layer1" showgrid="true" - inkscape:window-width="1366" - inkscape:window-height="721" + inkscape:window-width="2560" + inkscape:window-height="1393" inkscape:window-x="0" inkscape:window-y="0" 
inkscape:window-maximized="1"> @@ -223,20 +223,29 @@ id="tspan5715" x="-371.12311" y="478.59158" - style="font-size:20px">cost</tspan></text> + style="font-size:20px">cost(A)</tspan></text> <text xml:space="preserve" - style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" - x="-547.51123" - y="533.49603" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + x="525.60541" + y="539.85498" id="text5717" - sodipodi:linespacing="125%" - transform="matrix(0,-1,1,0,0,0)"><tspan + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + x="525.60541" + y="539.85498" + style="font-size:20px;text-align:start;text-anchor:start" + id="tspan3201">weight(A) = cost(A) + </tspan><tspan sodipodi:role="line" - id="tspan5719" - x="-547.51123" - y="533.49603" - style="font-size:20px">weight</tspan></text> + x="525.60541" + y="564.85498" + style="font-size:20px;text-align:start;text-anchor:start" + id="tspan3205"> max{weight(B), </tspan><tspan + sodipodi:role="line" + x="525.60541" + y="589.85498" + style="font-size:20px;text-align:start;text-anchor:start" + id="tspan3207"> weight(D)}</tspan></text> <path style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)" d="m 228.0315,308.26769 0,478.34646" @@ -255,5 +264,77 @@ x="-547.64691" y="222.03149" style="font-size:20px">time</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3209" + y="377.56561" + x="355.08435" + 
style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + xml:space="preserve"><tspan + style="font-size:20px" + y="377.56561" + x="355.08435" + id="tspan3211" + sodipodi:role="line">A</tspan></text> + <text + xml:space="preserve" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + x="300.77371" + y="501.85693" + id="text3213" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3215" + x="300.77371" + y="501.85693" + style="font-size:20px">B</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3217" + y="589.84717" + x="301.13992" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + xml:space="preserve"><tspan + style="font-size:20px" + y="589.84717" + x="301.13992" + id="tspan3219" + sodipodi:role="line">C</tspan></text> + <text + xml:space="preserve" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + x="406.5979" + y="545.14832" + id="text3221" + sodipodi:linespacing="125%"><tspan + 
sodipodi:role="line" + id="tspan3223" + x="406.5979" + y="545.14832" + style="font-size:20px">D</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3225" + y="677.85693" + x="352.81277" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + xml:space="preserve"><tspan + style="font-size:20px" + y="677.85693" + x="352.81277" + id="tspan3227" + sodipodi:role="line">E</tspan></text> + <text + xml:space="preserve" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" + x="458.92017" + y="721.14832" + id="text3229" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3231" + x="458.92017" + y="721.14832" + style="font-size:20px">F</tspan></text> </g> </svg> diff --git a/paper/paper.tex b/paper/paper.tex index 7cdf9c30eff11a9769323966814a9cac1b1d6f9a..cfa77730fa21a20aff817cd688a93f4574920395 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -66,7 +66,8 @@ programming with the concept of task conflicts, i.e.~sets of tasks that can be executed in any order, yet not concurrently. These conflicts are modelled using exclusively lockable hierarchical resources. -The scheduler is shown to perform and scale well on a 64-core parallel +The scheduler itself prioritizes tasks along the critical path +of execution and is shown to perform and scale well on a 64-core parallel shared-memory machine for two example problems: A tiled QR decomposition and a task-based Barnes-Hut tree code. 
\end{abstract} @@ -108,8 +109,8 @@ Several factors may limit the maximum degree of parallelism, e.g.~the structure of the task dependency DAG itself, or the order in which available tasks are executed. -\fig{Tasks} shows such a DAG for a set of tasks. -The arrows indicate the direction of the dependency, i.e.~an +\fig{Tasks} shows such a DAG for a set of tasks with +arrows indicating the direction of the dependencies, i.e.~an arrow from task $A$ to task $B$ indicates that task $B$ depends on task $A$. In a parallel setting, tasks $A$, $G$, and $J$ can be @@ -196,6 +197,14 @@ this problem in their respective implementations of the Fast Multipole Method (FMM), in which forces computed in different tasks are accumulated on a set of particles. +Several libraries provide some mechanism to model such +conflicts, either directly or indirectly. +In the QUARK scheduler, conflicts can be modelled by explicitly +marking dependencies as concurrent. +KAAPI and OmpSs, on the other hand, allow marking access to +certain variables as reductions, yet only for basic operations, +e.g.~summation or maximum/minimum. + This paper presents QuickSched, a framework for task-based parallel programming with constraints, which aims to achieve the following goals: @@ -207,7 +216,8 @@ the following goals: \item {\em Memory/cache efficiency}: Tasks accessing similar sets of data should be preferentially executed on the same core to preserve memory/cache locality as far as possible, and - \item {\em Parallel efficiency}: Tasks should be executed in an order + \item {\em Parallel efficiency}: The order in which the tasks + are executed should be chosen such that sufficient work is available for all computational threads at all times. 
\end{itemize} @@ -248,9 +258,9 @@ From a programmer's perspective, there are two main paradigms for generating task dependencies: \begin{itemize} \item Implicitly via spawning and waiting, e.g.~as is done in Cilk - \citep{ref:Blumofe1995}, or - \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs - \citep{ref:Duran2011}. + and OpenMP~3.0, or + \item Automatic extraction from data dependencies, e.g.~as is done in + StarPU, QUARK, and OmpSs. \end{itemize} The first scheme, spawning and waiting, is arguably the simplest to @@ -382,6 +392,9 @@ The parameters controlling the size of the tasks in the examples, i.e.~the tile size in the QR decomposition and the limits $n_\mathsf{max}$ and $n_\mathsf{task}$ were determined empirically and only optimized to the closest power of two or rough power of ten, respectively. +Further tuning these parameters could very likely lead to further +performance gains, but such an effort would go beyond the scope, +and point, of this paper. \section{Data Structures and Algorithms} @@ -467,7 +480,7 @@ struct task { and {\tt uses} arrays are pointers to the contents of other arrays, i.e.~they are not allocated individually. -What the task does is determined by the {\tt type} +{\em What} the task does is determined by the {\tt type} field, e.g.~which can be mapped to any particular function, and the {\tt data} pointer which points to an array of {\tt size\_data} bytes containing data specific to the task, @@ -507,7 +520,8 @@ The task cost can be either a rough estimate provided by the user, or the actual cost of the same task last time it was executed. The task weights are computed by traversing the tasks DAG in reverse topological order following their dependencies, -e.g.~as per \cite{ref:Kahn1962}, and computing each task's weight, e.g. +e.g.~as per \cite{ref:Kahn1962} in $\mathcal O(n)$ for $n$ tasks, +and computing each task's weight, e.g. 
\begin{equation*} \mbox{weight}_i = \mbox{cost}_i + \max_{j \in \mbox{\small unlocks}_i}\left\{\mbox{weight}_j\right\}. \end{equation*} @@ -669,7 +683,7 @@ operations for both insertion and deletion, i.e. for the bubble-up and trickle-down operations respectively. Unfortunately, there is no way of efficiently traversing all -the elements in the heap in decreasing order. +the elements in such a heap in decreasing order. The array of tasks is therefore traversed as if it were sorted, returning the first task that can be locked. Although the first task in the array will be the task with @@ -735,7 +749,7 @@ struct task *queue_get(struct queue *q) { } \end{lstlisting} \end{minipage}\end{center} -\noindent where as with the queue insertion, the queue is first +\noindent where, as with the queue insertion, the queue is first locked for exclusive access (line~4). The array of task pointers is then traversed (line~5), locking the resources of each task (lines~6--7). @@ -745,8 +759,8 @@ are released (lines~9--10), otherwise, the traversal is aborted If all the locks on a task could be obtained (line~14), the task pointer is replaced by the last pointer in the heap (line~16) and the heap order is restored (line~17). -Finally, the queue lock is released (line~19) and the locked task -or, if no lockable task could be found, {\tt NULL} is returned. +Finally, the queue lock is released (line~19) and the locked task, +or {\tt NULL} if no lockable task could be found, is returned. Note that this approach of sequentially locking multiple resources is prone to the so-called ``dining philosophers'' problem, i.e.~if @@ -757,14 +771,15 @@ This type of deadlock, however, is easily avoided by sorting the resources in each task according to some global criteria, e.g.~the resource ID or the address in memory of the resource. 
-Note also that although protecting the entire queue with a mutex -is not particularly scalable, and several authors, e.g.~REFS, +Note also that protecting the entire queue with a mutex +is not particularly scalable, and several authors, e.g.~\cite{ref:Sundell2003}, have presented concurrent data structures that avoid this type of locking. However, since we normally use one queue per computational thread, contention will only happens due to work-stealing, i.e.~when another idle computational thread tries to poach tasks. Since this happens only rarely, we opt for the simpler locking approach. +This decision is backed by the results in Section~5. \subsection{Scheduler} @@ -814,7 +829,8 @@ in which the code between lines~4 and~11 is executed concurrently. A version using {\tt pthreads} \citep{ref:Pthreads1995} directly\footnote{In most environments, OpenMP is implemented -on top of {\tt pthreads}, e.g. gcc's libgomp.} is also available. +on top of {\tt pthreads}, e.g.~the {\tt gcc} compiler's libgomp.} +is also available. The parallel section consists of a loop (lines~7--10) in which a task is acquired via {\tt qsched\_gettask} and its type and data are passed to a user-supplied diff --git a/paper/quicksched.bib b/paper/quicksched.bib index 0908731b3859e2cf9152b8e7cd10ae80929c66ae..99cbdd0165c134dc0287fa5bf1ddd4bf7b6f04a2 100644 --- a/paper/quicksched.bib +++ b/paper/quicksched.bib @@ -1,3 +1,12 @@ +@inproceedings{ref:Sundell2003, + title={Fast and lock-free concurrent priority queues for multi-thread systems}, + author={Sundell, H{\aa}kan and Tsigas, Philippas}, + booktitle={Parallel and Distributed Processing Symposium, 2003. Proceedings. International}, + pages={11--pp}, + year={2003}, + organization={IEEE} +} + @article{ref:Barnes1986, title={A hierarchical O (N log N) force-calculation algorithm}, author={Barnes, Josh and Hut, Piet},