diff --git a/paper/figures/Spawn.pdf b/paper/figures/Spawn.pdf index 263a78ebd1469b9299f37009926eca1d83faa697..a444386670c83f969883eb397d29298211ccac4f 100644 Binary files a/paper/figures/Spawn.pdf and b/paper/figures/Spawn.pdf differ diff --git a/paper/figures/Spawn.svg b/paper/figures/Spawn.svg index e58a69e9cbfe4f6f9291044b9eed9c7ae6e45e0b..2b0eea72cb09ea2c26b359adf20e2303dd39c257 100644 --- a/paper/figures/Spawn.svg +++ b/paper/figures/Spawn.svg @@ -95,7 +95,7 @@ <dc:format>image/svg+xml</dc:format> <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> + <dc:title /> </cc:Work> </rdf:RDF> </metadata> @@ -581,5 +581,29 @@ id="tspan3358" style="line-height:110.00000238%" /></text> </g> + <text + xml:space="preserve" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Urania Czech;-inkscape-font-specification:Urania Czech" + x="36" + y="444.36218" + id="text3070" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3072" + x="36" + y="444.36218" + style="-inkscape-font-specification:Bitstream Vera Sans;font-family:Bitstream Vera Sans;font-weight:normal;font-style:normal;font-stretch:normal;font-variant:normal;font-size:16px">a)</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3074" + y="444.36218" + x="376" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Urania Czech;-inkscape-font-specification:Urania Czech" + xml:space="preserve"><tspan + style="font-size:16px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans" + y="444.36218" + x="376" + id="tspan3076" + sodipodi:role="line">b)</tspan></text> </g> </svg> diff --git a/paper/figures/TaskWeight.pdf b/paper/figures/TaskWeight.pdf index b8edb62d24e271a46c359007c321dc44863559ff..68138a3c630a68295aa440ec325bab856320e84b 100644 Binary files a/paper/figures/TaskWeight.pdf and b/paper/figures/TaskWeight.pdf differ diff --git a/paper/figures/TaskWeight.svg b/paper/figures/TaskWeight.svg index b95886753dc0f09dcf2d9174d21d52e1e6f8ef10..4393ed8823fdd88e41d5846545bf63bed17025a3 100644 --- a/paper/figures/TaskWeight.svg +++ b/paper/figures/TaskWeight.svg @@ -14,7 +14,7 @@ id="svg5074" version="1.1" inkscape:version="0.48.4 r9939" - sodipodi:docname="New document 10"> + sodipodi:docname="TaskWeight.svg"> <defs id="defs5076"> <marker @@ -112,7 +112,7 @@ <dc:format>image/svg+xml</dc:format> <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> + <dc:title /> </cc:Work> </rdf:RDF> </metadata> @@ -215,26 +215,28 @@ xml:space="preserve" style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" x="-371.12311" - y="474.59158" + y="478.59158" id="text5713" sodipodi:linespacing="125%" transform="matrix(0,-1,1,0,0,0)"><tspan sodipodi:role="line" id="tspan5715" x="-371.12311" - y="474.59158">cost</tspan></text> + y="478.59158" 
+      style="font-size:20px">cost</tspan></text>
     <text
        xml:space="preserve"
        style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
        x="-547.51123"
-       y="529.49603"
+       y="533.49603"
        id="text5717"
        sodipodi:linespacing="125%"
        transform="matrix(0,-1,1,0,0,0)"><tspan
          sodipodi:role="line"
          id="tspan5719"
          x="-547.51123"
-         y="529.49603">weight</tspan></text>
+         y="533.49603"
+         style="font-size:20px">weight</tspan></text>
     <path
        style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
        d="m 228.0315,308.26769 0,478.34646"
@@ -251,6 +253,7 @@
          sodipodi:role="line"
          id="tspan6101"
          x="-547.64691"
-         y="222.03149">time</tspan></text>
+         y="222.03149"
+         style="font-size:20px">time</tspan></text>
   </g>
 </svg>
diff --git a/paper/paper.tex b/paper/paper.tex
index c4ea1fc172a69046aba50eb3ae5b4336b6092a83..ce53b3ff4db148f45530897e145f42235604104a 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -90,8 +90,8 @@ Task dependencies are used to model the flow of data between tasks,
 e.g.~if task $B$ requires some data generated by task $A$, then
 task $B$ {\em depends} on task $A$ and cannot be executed before
 task $A$ has completed.
-The tasks and their dependencies can be seen as the nodes and edges
-of a Directed Acyclic Graph (DAG) which can be
+The tasks and their dependencies can be seen as the nodes and edges,
+respectively, of a Directed Acyclic Graph (DAG) which can be
 traversed in topological order, executing the tasks at the nodes
 on the way down.
@@ -172,7 +172,7 @@ imperative that the execution of both tasks do not overlap, yet the
 order in which the tasks are executed is irrelevant.
 In the following, such a relationship will be referred to as
 a {\em conflict} between two tasks.
-\fig{TaskConflicts} shows a task graph with conflicting tasks
+\fig{TaskConflicts} shows a task graph extended by conflicting tasks
 joined by thick dashed lines.
 None of tasks $F$, $H$, and $I$ can be executed concurrently,
 i.e. they must be serialized, yet in no particular order.
@@ -180,8 +180,8 @@ i.e. they must be serialized, yet in no particular order.
 In dependency-only systems, such conflicts can be modelled with
 dependencies, which enforce a pre-determined arbitrary ordering on
 conflicting tasks.
-This unnecessary restriction on the order
-in which tasks can be scheduled can severely limit the
+This artificial restriction on the order
+in which tasks can be scheduled can, however, severely limit the
 parallelizability of a computation, especially in the presence
 of multiple conflicts per task.
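To make the distinction concrete: a conflict is nothing more than a
critical section, i.e.~the conflicting tasks must exclude each other
in time, but either one may go first, whereas a dependency would also
pin down the order.
The following minimal sketch uses plain {\tt pthreads} and is not part
of QuickSched; {\tt task\_F} and {\tt task\_H} are illustrative
stand-ins for the conflicting tasks $F$ and $H$ of \fig{TaskConflicts}.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <pthread.h>

static pthread_mutex_t cell_mutex = PTHREAD_MUTEX_INITIALIZER;
static double cell = 0.0;

/* Two conflicting tasks: both update the same cell and therefore must
   not overlap, yet either one may run first. */
void *task_F(void *arg) {
  pthread_mutex_lock(&cell_mutex);
  cell += 1.0;                      /* critical section */
  pthread_mutex_unlock(&cell_mutex);
  return NULL;
}

void *task_H(void *arg) {
  pthread_mutex_lock(&cell_mutex);
  cell += 2.0;                      /* critical section */
  pthread_mutex_unlock(&cell_mutex);
  return NULL;
}

int main(void) {
  pthread_t f, h;
  pthread_create(&f, NULL, task_F, NULL);
  pthread_create(&h, NULL, task_H, NULL);
  /* Modelling the conflict as a dependency would instead force an
     order, e.g. joining f before even creating h. */
  pthread_join(f, NULL);
  pthread_join(h, NULL);
  return 0;
}
\end{lstlisting}
\end{minipage}\end{center}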
 Both \cite{ref:Ltaief2012} and \cite{ref:Agullo2013} note
@@ -193,14 +193,14 @@ This paper presents QuickSched, a framework for task-based parallel
 programming with constraints, which aims to achieve the following
 goals:
 \begin{itemize}
-    \item {\em Correct}: All constraints, i.e.~dependencies and
+    \item {\em Correctness}: All constraints, i.e.~dependencies and
        conflicts, must be correctly enforced,
-    \item {\em Fast}: The overheads associated with task management
+    \item {\em Speed}: The overheads associated with task management
        should be as small as possible,
-    \item {\em Memory/cache-efficient}: Tasks accessing similar
+    \item {\em Memory/cache efficiency}: Tasks accessing similar
        sets of data should be preferentially executed on the same
        core to preserve memory/cache locality as far as possible, and
-    \item {\em Parallel-efficient}: Tasks should be executed in an order
+    \item {\em Parallel efficiency}: Tasks should be executed in an order
        that sufficient work is available for all computational threads
        at all times.
 \end{itemize}
@@ -237,40 +237,30 @@ directions.
 
 \section{Design Considerations}
 
-\begin{itemize}
-    \item Two flavours of dependency generation: Through spawning as in Cilk,
-        or automatically as per QUARK, OmpSS, etc...
-    \item Limitation of spawning with complex task hierarchies.
-    \item Limitation of dependency guessing is sometimes over-protective dependencies.
-    \item We let the user define dependencies explictly.
-    \item Con: Some effort needed to get dependencies right,
-    \item Pro: All dependencies known at the start of a computation, better
-        scheduling decisions can be made (see QR).
-    \item A word on task granularity...
-\end{itemize}
-
 From a programmer's perspective, there are two main paradigms for
 generating task dependencies:
 \begin{itemize}
-    \item Implicitly via spawning and waiting, e.g.~as is done in Cilk, or
-    \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs.
+    \item Implicitly via spawning and waiting, e.g.~as is done in Cilk
+        \cite{ref:Blumofe1995}, or
+    \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs
+        \cite{ref:Duran2011}.
 \end{itemize}
 
 The first scheme, spawning and waiting, is arguably the simplest to use.
 For simple depedency structures in which each task depends on only a
 single task, i.e.~if the task DAG is a tree, each task {\em spawns}, or
-creates, its dependent tasks after its completion (see \fig{Spawn}).
+creates, its dependent tasks after its completion (see \fig{Spawn}a).
 Hence for the tasks $A$--$E$ in \fig{Tasks}, task $A$ would
 spawn tasks $B$ and $D$, task $B$ would spawn task $C$, and task $D$
 would spawn task $E$.
 If a task has more than one dependency, e.g. tasks $D$--$F$ in \fig{Tasks},
 then the task generation order is reversed: Task $E$ is executed first,
 and first spawns tasks $D$ and $F$, and waits for both their completion
-before doing its own computations.
+before doing its own computations (see \fig{Spawn}b).
 
 Although simple to use, this implicit dependency management
-limits the types of DAGs that can be represented, i.e.~for
+limits the types of DAGs that can be represented, e.g.~for
 all the tasks in \fig{Tasks}, using such a spawning and waiting model
 would create implicit dependencies between the lowest-level
 tasks $C$, $E$, and $K$.
@@ -298,12 +288,13 @@ These data dependencies are provided explicitly by the programmer,
 e.g.~by describing which parameters to a task are input, output,
 and input/output.
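As an illustration of this style of annotation, the following is a
minimal sketch using the standard OpenMP~4.0 {\tt depend} clauses;
OmpSs uses its own, closely related annotations, and the code below is
not taken from either project.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <omp.h>

/* Three tasks on three cells; the in/out/inout annotations let the
   runtime extract the dependency chain A -> B -> C automatically. */
void example(double *a, double *b, double *c) {
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp task depend(out: a[0])                    /* task A: writes a */
    a[0] = 1.0;

    #pragma omp task depend(in: a[0]) depend(out: b[0])   /* task B: reads a, writes b */
    b[0] = 2.0 * a[0];

    #pragma omp task depend(in: b[0]) depend(inout: c[0]) /* task C: reads b, updates c */
    c[0] += b[0];
  } /* implicit barrier: all three tasks have completed here */
}
\end{lstlisting}
\end{minipage}\end{center}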
 The dependencies are enforced in the order in which the tasks are created.
-This approach usually relies on compiler extensions, e.g.~ {\tt pragma}s
+This approach usually relies on compiler extensions, e.g.~{\tt pragma}s
 in C/C++, or a system of function call wrappers, to describe the task
 parameters and their intent.
 
 This approach allows programmers to specify rich dependency hierarchies
-with very little effort, they still only allow for one type of relationship,
+with very little effort, i.e.~without having to explicitly think about
+dependencies at all, yet such systems still only allow for one type of relationship,
 i.e.~dependencies, and lack the ability to deal with conflicts as
 described in the previous section.
 They may also not be able to understand more complex memory access patterns,
@@ -319,22 +310,23 @@ This approach has two main advantages:
 \begin{itemize}
     \item It gives the user maximum flexibility with regards to the
        structure of the task graph generated,
-    \item Knowing the structure of the task graph before execution
+    \item Knowing the complete structure of the task graph before execution
        allows the task scheduler to make more informed decisions as
        to how the tasks should be prioritized.
 \end{itemize}
 
 \noindent The obvious disadvantage is the burden of producing a correct
-task graph is placed on the user.
+task graph is placed on the programmer.
 Although some problems such as cyclic dependencies can be detected
 automatically, there is no automatic way to detect whether the
-dependencies actualy reflect the intent of the user.
+dependencies actually reflect the intent of the programmer.
 Due to this added complexity, we consider QuickSched to be
 a tool not designed for casual parallel programmers, but for
-those interested in investing more programming effort to achieve
+those interested in investing a bit more programming effort to achieve
 better performance.
 
-Conflicts between tasks or groups of tasks are not specified directly,
+Unlike dependencies,
+conflicts between tasks or groups of tasks are not specified directly,
 but are instead modeled as exclusive locks on a shared resource
 which have to be obtained by a task before it can execute.
 Thus, in \fig{TaskConflicts}, before executing, task $F$ has
@@ -344,8 +336,8 @@ While task $F$ is being executed, neither $H$ nor $I$ can
 lock the same resource, and therefore will not execute
 until task $F$ is done and the lock has been released.
 
-The partitioning of the computation into tasks is also left entirely
-to the user.
+As with all other task-based libraries, the partitioning of the
+computation into tasks is also left entirely to the programmer.
 In theory, any program can be automatically converted to a task-based
 representation since each statement in the program code
 can be considered a single task, with dependencies to the
@@ -356,8 +348,9 @@ of graph algorithms.
 The decomposition of a computation into tasks, however,
 usually involves re-thinking the underlying algorithms such
 that they best fit the task-based paradigm, e.g.~as in the examples in the
-following sections, or as in \cite{ref:Gonnet2014}.
-This process requires careful evaluation and is probably best
+following sections, or as in \cite{ref:Gonnet2014,ref:Buttari2009,ref:Ltaief2012}.
+This process requires careful evaluation of the underlying
+computation, and is probably best
 not left as an automatic transformation of an otherwise serial
 code.
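To give a flavour of what this explicit style of task, dependency, and
conflict specification looks like, the following sketch sets up the
three mutually conflicting tasks of \fig{TaskConflicts}.
The {\tt sched\_*} declarations and types are hypothetical placeholders
used for illustration only; they are not QuickSched's actual interface.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <stddef.h>

/* Hypothetical sketch only: the sched_* declarations below are
   illustrative placeholders, not QuickSched's actual API. */
struct sched;                    /* opaque scheduler handle     */
typedef int task_id;             /* handle to a task            */
typedef int res_id;              /* handle to a shared resource */
enum { TASK_UPDATE = 1 };        /* an illustrative task type   */

task_id sched_addtask(struct sched *s, int type, void *data, size_t size);
res_id  sched_addres(struct sched *s);
void    sched_addlock(struct sched *s, task_id t, res_id r);   /* conflict   */
void    sched_adddep(struct sched *s, task_id ta, task_id tb); /* tb after ta */

void setup(struct sched *s, double *cell) {
  /* The shared cell updated by the conflicting tasks F, H, and I. */
  res_id cell_res = sched_addres(s);

  task_id F = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);
  task_id H = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);
  task_id I = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);

  /* Conflicts: F, H, and I may run in any order, but never concurrently. */
  sched_addlock(s, F, cell_res);
  sched_addlock(s, H, cell_res);
  sched_addlock(s, I, cell_res);

  /* A dependency, by contrast, would also fix the order, e.g.
     sched_adddep(s, F, H) forces H to wait for F. */
}
\end{lstlisting}
\end{minipage}\end{center}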
 Finally, the task granularity is an important issue: if the task
@@ -387,7 +380,6 @@ cache efficiency of the computation.
 
 The QuickSched task scheduler consist of four main objects types:
 {\em task}, {\em resource}, {\em scheduler}, and {\em queue}.
-
 The task and resource objects are used to model the computation,
 i.e. the work that is to be done and the data on which it will be
 done, respectively.
@@ -438,10 +430,8 @@ The scheduler is in charge of selecting the most appropriate
 queue for each task, based on information stored in each task
 on which resources are used.
 Given a set of tasks with similar resources for which all
-dependencies are resolved, the queue then decides which
+dependencies are resolved, it is up to the queue to decide which
 tasks to prioritize.
-This decision is made based on the length of the critical
-path of the dependencies of each task.
 
 The following subsections describe these four object types
 in detail, as well as their operations.
@@ -510,7 +500,7 @@ as per \cite{ref:Kahn1962}, and computing each task's weight up
 the task DAG.
 
 \begin{figure}
-    \centerline{\epsfig{file=figures/TaskWeight.pdf,height=0.3\textwidth}}
+    \centerline{\epsfig{file=figures/TaskWeight.pdf,height=0.4\textwidth}}
    \caption{Computation of the task weight.
        In this task graph, the height of each task corresponds to its
        computational {\em cost}.
@@ -624,8 +614,8 @@ has been locked (line~5).
 In lines~9--10 the hold counters of the hierarchical parents
 are incremented using the procedure described earlier.
 If this process fails at any point (line~11), the
-previously set hold counters are decremented (line~12)
-and the lock is released (line~13).
+previously set hold counters are decremented (line~13)
+and the lock is released (line~14).
 The procedure then returns {\tt 1} or {\tt 0} if the resource
 could be locked or not, respectively.
@@ -637,7 +627,8 @@ resource hierarchy are decremented.
 
 \subsection{Queues}
 
 The main job of the task queues is, given a set of ready tasks,
-to find the task with maximum weight whose resources can all
+to find the task with maximum weight, i.e.~the task along the
+longest critical path, whose resources can all
 be locked, and to do so as efficiently as possible.
 
 One possible strategy would be to maintain an array of tasks
@@ -656,16 +647,20 @@ organized as a max-heap, i.e.~where the $k$th entry is ``larger''
 than both the $2k+1$st and the $2k+2$nd entry, with the
 task with maximum weight in the first position.
-Maintaining this heap structure thus requires \oh{\log n}
+Maintaining this heap structure requires \oh{\log n}
 operations for both insertion and deletion, i.e. for the
 bubble-up and trickle-down operations respectively.
-The array of tasks is then traversed as if it were sorted,
+Unfortunately, there is no way of efficiently traversing all
+the elements in the heap in decreasing order.
+The array of tasks is therefore traversed as if it were sorted,
 returning the first task that can be locked.
 Although the first task in the array will be the task with
 maximum weight, the following tasks are only loosely ordered,
 where the $k$th of $n$ tasks has a larger weight than at least
 $\lfloor n/k\rfloor -1$ other tasks.
+Although this is not a particularly good lower bound, it turns
+out to be quite sufficient in practice.
 
 The data structure for the queue is defined as follows:
 \begin{center}\begin{minipage}{0.9\textwidth}
@@ -736,6 +731,14 @@ and the heap order is restored (line~17).
 Finally, the queue lock is released (line~19) and the locked
 task or, if no lockable task could be found, {\tt NULL} is returned.
+Note that this approach of sequentially locking multiple resources
+is prone to the so-called ``dining philosophers'' problem, i.e.~if
+two tasks simultaneously attempt to lock the resources $A$ and $B$,
+and $B$ and $A$, respectively, via separate queues, their respective calls
+to {\tt queue\_get} may fail perpetually.
+This type of deadlock, however, is easily avoided by sorting the
+resources in each task according to some global criterion, e.g.~the
+resource ID or the address in memory of the resource.
 
 \subsection{Scheduler}
 
@@ -784,7 +787,8 @@ for most compilers, is used to create a parallel section in
 which the code between lines~4 and~11 is executed concurrently.
 
 A version using {\tt pthreads} \cite{ref:Pthreads1995}
-directly is also available.
+directly\footnote{In most environments, OpenMP is implemented
+on top of {\tt pthreads}, e.g.~gcc's libgomp.} is also available.
 The parallel section consists of a loop (lines~7--10)
 in which a task is acquired via {\tt qsched\_gettask}
 and its type and data are passed to a user-supplied
@@ -806,24 +810,25 @@ the resources used and locked by the task, e.g.:
 \begin{center}\begin{minipage}{0.9\textwidth}
 \begin{lstlisting}
 void qsched_enqueue(qsched *s, struct task *t) {
-  int k, qid, score[s->nr_queues];
-  for (k = 0; k < s->nr_queues; k++)
+  int best = 0, score[s->nr_queues];
+  for (int k = 0; k < s->nr_queues; k++)
     score[k] = 0;
-  for (j = 0; j < t->nr_locks; j++)
-    score[t->locks[j]->owner] += 1;
-  for (j = 0; j < t->nr_uses; j++)
-    score[t->uses[j]->owner] += 1;
-  for (qid = 0, k = 1; k < s->nr_queues; k++)
-    if (score[k] > score[qid])
-      qid = k;
-  queue_put(&s->queues[qid], t);
+  for (int k = 0; k < t->nr_locks; k++) {
+    int qid = t->locks[k]->owner;
+    if (++score[qid] > score[best]) best = qid;
+  }
+  for (int k = 0; k < t->nr_uses; k++) {
+    int qid = t->uses[k]->owner;
+    if (++score[qid] > score[best]) best = qid;
+  }
+  queue_put(&s->queues[best], t);
 }
 \end{lstlisting}
 \end{minipage}\end{center}
 \noindent where the array {\tt score} keeps a count of the task
 resources ``owned'', or last used, by each queue.
-In lines~9--11 the queue with the highest such score is
-chosen on which the task is then put (line~12).
+The task is then sent to the queue with the highest such score
+(line~13).
 
 The function {\tt qsched\_gettask} fetches a task from
 one of the queues:
@@ -876,9 +881,9 @@ Once all the dependent tasks have been unlocked, the
 
 \section{Validation}
 
-This section presents two test cases showing
+This section presents two test cases that show
 how QuickSched can be used in real-world applications, and
-providing benchmarks to assess its efficiency and scalability.
+provides benchmarks to assess its efficiency and scalability.
 The first test is the tiled QR decomposition originally
 described in \cite{ref:Buttari2009}, which has been used as a benchmark
 by other authors \cite{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}.
@@ -897,8 +902,7 @@ QuickSched library, along with scripts to run the benchmarks
 and generate the plots used in the following.
 All examples were compiled with gcc v.\,4.8.2 using the
 {\tt -O2 -march=native} flags and run on
-a 64-core AMD Opteron 6376 machine running
-at 2.67\,GHz.
+a 64-core AMD Opteron 6376 machine at 2.67\,GHz.
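Before turning to the individual test cases, the following sketch
illustrates the global resource ordering mentioned in the discussion
of {\tt queue\_get} above; the {\tt struct} layouts and the field
{\tt id} are illustrative assumptions, not QuickSched's actual data
structures.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <stdlib.h>

/* Illustrative layout only, not QuickSched's actual structures. */
struct res  { int id; };                            /* globally unique ID */
struct task { int nr_locks; struct res **locks; };  /* resources to lock  */

/* Compare two resources by their global ID. */
static int cmp_res(const void *a, const void *b) {
  const struct res *ra = *(const struct res *const *)a;
  const struct res *rb = *(const struct res *const *)b;
  return (ra->id > rb->id) - (ra->id < rb->id);
}

/* Sorting every task's lock list by the same global criterion ensures
   that any two tasks sharing resources always try to acquire them in
   the same order, so the perpetual mutual failure described earlier
   cannot occur. */
void task_sort_locks(struct task *t) {
  qsort(t->locks, t->nr_locks, sizeof(struct res *), cmp_res);
}
\end{lstlisting}
\end{minipage}\end{center}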
 \subsection{Task-Based QR Decomposition}
@@ -990,6 +994,13 @@ the DGEQRF tasks (in red) are scheduled as soon as they
 become available in QuickSched, thus preventing bottlenecks
 near the end of the computation.
 
+Since in QuickSched the entire task structure is known explicitly
+in advance, the scheduler ``knows'' that the DGEQRF tasks all
+lie on the longest critical path and therefore executes them as
+soon as possible.
+OmpSs does not exploit this knowledge, resulting in the less efficient
+scheduling seen in \fig{QRTasks}.
+
 \begin{figure}
    \centerline{\epsfig{file=figures/QR_scaling.pdf,width=0.9\textwidth}}
    \caption{Strong scaling and parallel efficiency of the tiled QR decomposition