diff --git a/paper/figures/Spawn.pdf b/paper/figures/Spawn.pdf index 263a78ebd1469b9299f37009926eca1d83faa697..a444386670c83f969883eb397d29298211ccac4f 100644 Binary files a/paper/figures/Spawn.pdf and b/paper/figures/Spawn.pdf differ diff --git a/paper/figures/Spawn.svg b/paper/figures/Spawn.svg index e58a69e9cbfe4f6f9291044b9eed9c7ae6e45e0b..2b0eea72cb09ea2c26b359adf20e2303dd39c257 100644 --- a/paper/figures/Spawn.svg +++ b/paper/figures/Spawn.svg @@ -95,7 +95,7 @@ <dc:format>image/svg+xml</dc:format> <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> + <dc:title /> </cc:Work> </rdf:RDF> </metadata> @@ -581,5 +581,29 @@ id="tspan3358" style="line-height:110.00000238%" /></text> </g> + <text + xml:space="preserve" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Urania Czech;-inkscape-font-specification:Urania Czech" + x="36" + y="444.36218" + id="text3070" + sodipodi:linespacing="125%"><tspan + sodipodi:role="line" + id="tspan3072" + x="36" + y="444.36218" + style="-inkscape-font-specification:Bitstream Vera Sans;font-family:Bitstream Vera Sans;font-weight:normal;font-style:normal;font-stretch:normal;font-variant:normal;font-size:16px">a)</tspan></text> + <text + sodipodi:linespacing="125%" + id="text3074" + y="444.36218" + x="376" + style="font-size:12px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Urania Czech;-inkscape-font-specification:Urania Czech" + xml:space="preserve"><tspan + style="font-size:16px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans" + y="444.36218" + x="376" + id="tspan3076" + sodipodi:role="line">b)</tspan></text> </g> </svg> diff --git a/paper/figures/TaskWeight.pdf b/paper/figures/TaskWeight.pdf index b8edb62d24e271a46c359007c321dc44863559ff..68138a3c630a68295aa440ec325bab856320e84b 100644 Binary files a/paper/figures/TaskWeight.pdf and b/paper/figures/TaskWeight.pdf differ diff --git a/paper/figures/TaskWeight.svg b/paper/figures/TaskWeight.svg index b95886753dc0f09dcf2d9174d21d52e1e6f8ef10..4393ed8823fdd88e41d5846545bf63bed17025a3 100644 --- a/paper/figures/TaskWeight.svg +++ b/paper/figures/TaskWeight.svg @@ -14,7 +14,7 @@ id="svg5074" version="1.1" inkscape:version="0.48.4 r9939" - sodipodi:docname="New document 10"> + sodipodi:docname="TaskWeight.svg"> <defs id="defs5076"> <marker @@ -112,7 +112,7 @@ <dc:format>image/svg+xml</dc:format> <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title></dc:title> + <dc:title /> </cc:Work> </rdf:RDF> </metadata> @@ -215,26 +215,28 @@ xml:space="preserve" style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold" x="-371.12311" - y="474.59158" + y="478.59158" id="text5713" sodipodi:linespacing="125%" transform="matrix(0,-1,1,0,0,0)"><tspan sodipodi:role="line" id="tspan5715" x="-371.12311" - y="474.59158">cost</tspan></text> + y="478.59158" 
+      style="font-size:20px">cost</tspan></text>
     <text
        xml:space="preserve"
        style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
        x="-547.51123"
-       y="529.49603"
+       y="533.49603"
        id="text5717"
        sodipodi:linespacing="125%"
        transform="matrix(0,-1,1,0,0,0)"><tspan
          sodipodi:role="line"
          id="tspan5719"
          x="-547.51123"
-         y="529.49603">weight</tspan></text>
+         y="533.49603"
+         style="font-size:20px">weight</tspan></text>
     <path
        style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
        d="m 228.0315,308.26769 0,478.34646"
@@ -251,6 +253,7 @@
          sodipodi:role="line"
          id="tspan6101"
          x="-547.64691"
-         y="222.03149">time</tspan></text>
+         y="222.03149"
+         style="font-size:20px">time</tspan></text>
   </g>
 </svg>
diff --git a/paper/paper.tex b/paper/paper.tex
index c4ea1fc172a69046aba50eb3ae5b4336b6092a83..ce53b3ff4db148f45530897e145f42235604104a 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -90,8 +90,8 @@ Task dependencies are used to model the flow of data between tasks,
 e.g.~if task $B$ requires some data generated by task $A$, then
 task $B$ {\em depends} on task $A$ and cannot be executed before
 task $A$ has completed.
-The tasks and their dependencies can be seen as the nodes and edges
-of a Directed Acyclic Graph (DAG) which can be
+The tasks and their dependencies can be seen as the nodes and edges,
+respectively, of a Directed Acyclic Graph (DAG) which can be
 traversed in topological order, executing the tasks at the nodes
 on the way down.
@@ -172,7 +172,7 @@ imperative that the execution of both tasks do not overlap, yet the
 order in which the tasks are executed is irrelevant.
 In the following, such a relationship will be referred to as
 a {\em conflict} between two tasks.
-\fig{TaskConflicts} shows a task graph with conflicting tasks
+\fig{TaskConflicts} shows a task graph extended by conflicting tasks
 joined by thick dashed lines.
 None of tasks $F$, $H$, and $I$ can be executed concurrently,
 i.e. they must be serialized, yet in no particular order.
@@ -180,8 +180,8 @@ i.e. they must be serialized, yet in no particular order.
 In dependency-only systems, such conflicts can be modelled with
 dependencies, which enforce a pre-determined arbitrary ordering on
 conflicting tasks.
-This unnecessary restriction on the order
-in which tasks can be scheduled can severely limit the
+This artificial restriction on the order
+in which tasks can be scheduled can, however, severely limit the
 parallelizability of a computation, especially in the presence
 of multiple conflicts per task.
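To make the distinction concrete: a conflict is nothing more than a
critical section, i.e.~the conflicting tasks must exclude each other
in time, but either one may go first, whereas a dependency would also
pin down the order.
The following minimal sketch uses plain {\tt pthreads} and is not part
of QuickSched; {\tt task\_F} and {\tt task\_H} are illustrative
stand-ins for the conflicting tasks $F$ and $H$ of \fig{TaskConflicts}.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <pthread.h>

static pthread_mutex_t cell_mutex = PTHREAD_MUTEX_INITIALIZER;
static double cell = 0.0;

/* Two conflicting tasks: both update the same cell and therefore must
   not overlap, yet either one may run first. */
void *task_F(void *arg) {
  pthread_mutex_lock(&cell_mutex);
  cell += 1.0;                      /* critical section */
  pthread_mutex_unlock(&cell_mutex);
  return NULL;
}

void *task_H(void *arg) {
  pthread_mutex_lock(&cell_mutex);
  cell += 2.0;                      /* critical section */
  pthread_mutex_unlock(&cell_mutex);
  return NULL;
}

int main(void) {
  pthread_t f, h;
  pthread_create(&f, NULL, task_F, NULL);
  pthread_create(&h, NULL, task_H, NULL);
  /* Modelling the conflict as a dependency would instead force an
     order, e.g. joining f before even creating h. */
  pthread_join(f, NULL);
  pthread_join(h, NULL);
  return 0;
}
\end{lstlisting}
\end{minipage}\end{center}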
 Both \cite{ref:Ltaief2012} and \cite{ref:Agullo2013} note
@@ -193,14 +193,14 @@ This paper presents QuickSched, a framework for task-based parallel
 programming with constraints, which aims to achieve the following
 goals:
 \begin{itemize}
-    \item {\em Correct}: All constraints, i.e.~dependencies and
+    \item {\em Correctness}: All constraints, i.e.~dependencies and
        conflicts, must be correctly enforced,
-    \item {\em Fast}: The overheads associated with task management
+    \item {\em Speed}: The overheads associated with task management
        should be as small as possible,
-    \item {\em Memory/cache-efficient}: Tasks accessing similar
+    \item {\em Memory/cache efficiency}: Tasks accessing similar
        sets of data should be preferentially executed on the same
        core to preserve memory/cache locality as far as possible, and
-    \item {\em Parallel-efficient}: Tasks should be executed in an order
+    \item {\em Parallel efficiency}: Tasks should be executed in an order
        that sufficient work is available for all computational threads
        at all times.
 \end{itemize}
@@ -237,40 +237,30 @@ directions.
 
 \section{Design Considerations}
 
-\begin{itemize}
-    \item Two flavours of dependency generation: Through spawning as in Cilk,
-        or automatically as per QUARK, OmpSS, etc...
-    \item Limitation of spawning with complex task hierarchies.
-    \item Limitation of dependency guessing is sometimes over-protective dependencies.
-    \item We let the user define dependencies explictly.
-    \item Con: Some effort needed to get dependencies right,
-    \item Pro: All dependencies known at the start of a computation, better
-        scheduling decisions can be made (see QR).
-    \item A word on task granularity...
-\end{itemize}
-
 From a programmer's perspective, there are two main paradigms for
 generating task dependencies:
 \begin{itemize}
-    \item Implicitly via spawning and waiting, e.g.~as is done in Cilk, or
-    \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs.
+    \item Implicitly via spawning and waiting, e.g.~as is done in Cilk
+        \cite{ref:Blumofe1995}, or
+    \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs
+        \cite{ref:Duran2011}.
 \end{itemize}
 
 The first scheme, spawning and waiting, is arguably the simplest to use.
 For simple depedency structures in which each task depends on only a
 single task, i.e.~if the task DAG is a tree, each task {\em spawns}, or
-creates, its dependent tasks after its completion (see \fig{Spawn}).
+creates, its dependent tasks after its completion (see \fig{Spawn}a).
 Hence for the tasks $A$--$E$ in \fig{Tasks}, task $A$ would
 spawn tasks $B$ and $D$, task $B$ would spawn task $C$, and task $D$
 would spawn task $E$.
 If a task has more than one dependency, e.g. tasks $D$--$F$ in \fig{Tasks},
 then the task generation order is reversed: Task $E$ is executed first,
 and first spawns tasks $D$ and $F$, and waits for both their completion
-before doing its own computations.
+before doing its own computations (see \fig{Spawn}b).
 
 Although simple to use, this implicit dependency management
-limits the types of DAGs that can be represented, i.e.~for
+limits the types of DAGs that can be represented, e.g.~for
 all the tasks in \fig{Tasks}, using such a spawning and waiting model
 would create implicit dependencies between the lowest-level
 tasks $C$, $E$, and $K$.
@@ -298,12 +288,13 @@ These data dependencies are provided explicitly by the programmer,
 e.g.~by describing which parameters to a task are input, output,
 and input/output.
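As an illustration of this style of annotation, the following is a
minimal sketch using the standard OpenMP~4.0 {\tt depend} clauses;
OmpSs uses its own, closely related annotations, and the code below is
not taken from either project.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <omp.h>

/* Three tasks on three cells; the in/out/inout annotations let the
   runtime extract the dependency chain A -> B -> C automatically. */
void example(double *a, double *b, double *c) {
  #pragma omp parallel
  #pragma omp single
  {
    #pragma omp task depend(out: a[0])                    /* task A: writes a */
    a[0] = 1.0;

    #pragma omp task depend(in: a[0]) depend(out: b[0])   /* task B: reads a, writes b */
    b[0] = 2.0 * a[0];

    #pragma omp task depend(in: b[0]) depend(inout: c[0]) /* task C: reads b, updates c */
    c[0] += b[0];
  } /* implicit barrier: all three tasks have completed here */
}
\end{lstlisting}
\end{minipage}\end{center}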
 The dependencies are enforced in the order in which the tasks are created.
-This approach usually relies on compiler extensions, e.g.~ {\tt pragma}s
+This approach usually relies on compiler extensions, e.g.~{\tt pragma}s
 in C/C++, or a system of function call wrappers, to describe the task
 parameters and their intent.
 
 This approach allows programmers to specify rich dependency hierarchies
-with very little effort, they still only allow for one type of relationship,
+with very little effort, i.e.~without having to explicitly think about
+dependencies at all, yet such systems still only allow for one type of relationship,
 i.e.~dependencies, and lack the ability to deal with conflicts as
 described in the previous section.
 They may also not be able to understand more complex memory access patterns,
@@ -319,22 +310,23 @@ This approach has two main advantages:
 \begin{itemize}
     \item It gives the user maximum flexibility with regards to the
        structure of the task graph generated,
-    \item Knowing the structure of the task graph before execution
+    \item Knowing the complete structure of the task graph before execution
        allows the task scheduler to make more informed decisions as
        to how the tasks should be prioritized.
 \end{itemize}
 
 \noindent The obvious disadvantage is the burden of producing a correct
-task graph is placed on the user.
+task graph is placed on the programmer.
 Although some problems such as cyclic dependencies can be detected
 automatically, there is no automatic way to detect whether the
-dependencies actualy reflect the intent of the user.
+dependencies actually reflect the intent of the programmer.
 Due to this added complexity, we consider QuickSched to be
 a tool not designed for casual parallel programmers, but for
-those interested in investing more programming effort to achieve
+those interested in investing a bit more programming effort to achieve
 better performance.
 
-Conflicts between tasks or groups of tasks are not specified directly,
+Unlike dependencies,
+conflicts between tasks or groups of tasks are not specified directly,
 but are instead modeled as exclusive locks on a shared resource
 which have to be obtained by a task before it can execute.
 Thus, in \fig{TaskConflicts}, before executing, task $F$ has
@@ -344,8 +336,8 @@ While task $F$ is being executed, neither $H$ nor $I$ can
 lock the same resource, and therefore will not execute
 until task $F$ is done and the lock has been released.
 
-The partitioning of the computation into tasks is also left entirely
-to the user.
+As with all other task-based libraries, the partitioning of the
+computation into tasks is also left entirely to the programmer.
 In theory, any program can be automatically converted to a task-based
 representation since each statement in the program code
 can be considered a single task, with dependencies to the
@@ -356,8 +348,9 @@ of graph algorithms.
 The decomposition of a computation into tasks, however,
 usually involves re-thinking the underlying algorithms such
 that they best fit the task-based paradigm, e.g.~as in the examples in the
-following sections, or as in \cite{ref:Gonnet2014}.
-This process requires careful evaluation and is probably best
+following sections, or as in \cite{ref:Gonnet2014,ref:Buttari2009,ref:Ltaief2012}.
+This process requires careful evaluation of the underlying
+computation, and is probably best
 not left as an automatic transformation of an otherwise serial
 code.
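To give a flavour of what this explicit style of task, dependency, and
conflict specification looks like, the following sketch sets up the
three mutually conflicting tasks of \fig{TaskConflicts}.
The {\tt sched\_*} declarations and types are hypothetical placeholders
used for illustration only; they are not QuickSched's actual interface.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <stddef.h>

/* Hypothetical sketch only: the sched_* declarations below are
   illustrative placeholders, not QuickSched's actual API. */
struct sched;                    /* opaque scheduler handle     */
typedef int task_id;             /* handle to a task            */
typedef int res_id;              /* handle to a shared resource */
enum { TASK_UPDATE = 1 };        /* an illustrative task type   */

task_id sched_addtask(struct sched *s, int type, void *data, size_t size);
res_id  sched_addres(struct sched *s);
void    sched_addlock(struct sched *s, task_id t, res_id r);   /* conflict   */
void    sched_adddep(struct sched *s, task_id ta, task_id tb); /* tb after ta */

void setup(struct sched *s, double *cell) {
  /* The shared cell updated by the conflicting tasks F, H, and I. */
  res_id cell_res = sched_addres(s);

  task_id F = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);
  task_id H = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);
  task_id I = sched_addtask(s, TASK_UPDATE, cell, sizeof *cell);

  /* Conflicts: F, H, and I may run in any order, but never concurrently. */
  sched_addlock(s, F, cell_res);
  sched_addlock(s, H, cell_res);
  sched_addlock(s, I, cell_res);

  /* A dependency, by contrast, would also fix the order, e.g.
     sched_adddep(s, F, H) forces H to wait for F. */
}
\end{lstlisting}
\end{minipage}\end{center}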
 Finally, the task granularity is an important issue: if the task
@@ -387,7 +380,6 @@ cache efficiency of the computation.
 
 The QuickSched task scheduler consist of four main objects types:
 {\em task}, {\em resource}, {\em scheduler}, and {\em queue}.
-
 The task and resource objects are used to model the computation,
 i.e. the work that is to be done and the data on which it will be
 done, respectively.
@@ -438,10 +430,8 @@ The scheduler is in charge of selecting the most appropriate
 queue for each task, based on information stored in each task
 on which resources are used.
 Given a set of tasks with similar resources for which all
-dependencies are resolved, the queue then decides which
+dependencies are resolved, it is up to the queue to decide which
 tasks to prioritize.
-This decision is made based on the length of the critical
-path of the dependencies of each task.
 
 The following subsections describe these four object types
 in detail, as well as their operations.
@@ -510,7 +500,7 @@ as per \cite{ref:Kahn1962}, and computing each task's weight up
 the task DAG.
 
 \begin{figure}
-    \centerline{\epsfig{file=figures/TaskWeight.pdf,height=0.3\textwidth}}
+    \centerline{\epsfig{file=figures/TaskWeight.pdf,height=0.4\textwidth}}
    \caption{Computation of the task weight.
        In this task graph, the height of each task corresponds to its
        computational {\em cost}.
@@ -624,8 +614,8 @@ has been locked (line~5).
 In lines~9--10 the hold counters of the hierarchical parents
 are incremented using the procedure described earlier.
 If this process fails at any point (line~11), the
-previously set hold counters are decremented (line~12)
-and the lock is released (line~13).
+previously set hold counters are decremented (line~13)
+and the lock is released (line~14).
 The procedure then returns {\tt 1} or {\tt 0} if the resource
 could be locked or not, respectively.
@@ -637,7 +627,8 @@ resource hierarchy are decremented.
 
 \subsection{Queues}
 
 The main job of the task queues is, given a set of ready tasks,
-to find the task with maximum weight whose resources can all
+to find the task with maximum weight, i.e.~the task along the
+longest critical path, whose resources can all
 be locked, and to do so as efficiently as possible.
 
 One possible strategy would be to maintain an array of tasks
@@ -656,16 +647,20 @@ organized as a max-heap, i.e.~where the $k$th entry is ``larger''
 than both the $2k+1$st and the $2k+2$nd entry, with the
 task with maximum weight in the first position.
-Maintaining this heap structure thus requires \oh{\log n}
+Maintaining this heap structure requires \oh{\log n}
 operations for both insertion and deletion, i.e. for the
 bubble-up and trickle-down operations respectively.
-The array of tasks is then traversed as if it were sorted,
+Unfortunately, there is no way of efficiently traversing all
+the elements in the heap in decreasing order.
+The array of tasks is therefore traversed as if it were sorted,
 returning the first task that can be locked.
 Although the first task in the array will be the task with
 maximum weight, the following tasks are only loosely ordered,
 where the $k$th of $n$ tasks has a larger weight than at least
 $\lfloor n/k\rfloor -1$ other tasks.
+Although this is not a particularly good lower bound, it turns
+out to be quite sufficient in practice.
 
 The data structure for the queue is defined as follows:
 \begin{center}\begin{minipage}{0.9\textwidth}
@@ -736,6 +731,14 @@ and the heap order is restored (line~17).
 Finally, the queue lock is released (line~19) and the locked
 task or, if no lockable task could be found, {\tt NULL} is returned.
+Note that this approach of sequentially locking multiple resources
+is prone to the so-called ``dining philosophers'' problem, i.e.~if
+two tasks simultaneously attempt to lock the resources $A$ and $B$,
+and $B$ and $A$, respectively, via separate queues, their respective calls
+to {\tt queue\_get} may fail perpetually.
+This type of deadlock, however, is easily avoided by sorting the
+resources in each task according to some global criterion, e.g.~the
+resource ID or the address in memory of the resource.
 
 \subsection{Scheduler}
 
@@ -784,7 +787,8 @@ for most compilers, is used to create a parallel section in
 which the code between lines~4 and~11 is executed concurrently.
 
 A version using {\tt pthreads} \cite{ref:Pthreads1995}
-directly is also available.
+directly\footnote{In most environments, OpenMP is implemented
+on top of {\tt pthreads}, e.g.~gcc's libgomp.} is also available.
 The parallel section consists of a loop (lines~7--10)
 in which a task is acquired via {\tt qsched\_gettask}
 and its type and data are passed to a user-supplied
@@ -806,24 +810,25 @@ the resources used and locked by the task, e.g.:
 \begin{center}\begin{minipage}{0.9\textwidth}
 \begin{lstlisting}
 void qsched_enqueue(qsched *s, struct task *t) {
-  int k, qid, score[s->nr_queues];
-  for (k = 0; k < s->nr_queues; k++)
+  int best = 0, score[s->nr_queues];
+  for (int k = 0; k < s->nr_queues; k++)
     score[k] = 0;
-  for (j = 0; j < t->nr_locks; j++)
-    score[t->locks[j]->owner] += 1;
-  for (j = 0; j < t->nr_uses; j++)
-    score[t->uses[j]->owner] += 1;
-  for (qid = 0, k = 1; k < s->nr_queues; k++)
-    if (score[k] > score[qid])
-      qid = k;
-  queue_put(&s->queues[qid], t);
+  for (int k = 0; k < t->nr_locks; k++) {
+    int qid = t->locks[k]->owner;
+    if (++score[qid] > score[best]) best = qid;
+  }
+  for (int k = 0; k < t->nr_uses; k++) {
+    int qid = t->uses[k]->owner;
+    if (++score[qid] > score[best]) best = qid;
+  }
+  queue_put(&s->queues[best], t);
 }
 \end{lstlisting}
 \end{minipage}\end{center}
 \noindent where the array {\tt score} keeps a count of the task
 resources ``owned'', or last used, by each queue.
-In lines~9--11 the queue with the highest such score is
-chosen on which the task is then put (line~12).
+The task is then sent to the queue with the highest such score
+(line~13).
 
 The function {\tt qsched\_gettask} fetches a task from
 one of the queues:
@@ -876,9 +881,9 @@ Once all the dependent tasks have been unlocked, the
 
 \section{Validation}
 
-This section presents two test cases showing
+This section presents two test cases that show
 how QuickSched can be used in real-world applications, and
-providing benchmarks to assess its efficiency and scalability.
+provides benchmarks to assess its efficiency and scalability.
 The first test is the tiled QR decomposition originally
 described in \cite{ref:Buttari2009}, which has been used as a benchmark
 by other authors \cite{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}.
@@ -897,8 +902,7 @@ QuickSched library, along with scripts to run the benchmarks
 and generate the plots used in the following.
 All examples were compiled with gcc v.\,4.8.2 using the
 {\tt -O2 -march=native} flags and run on
-a 64-core AMD Opteron 6376 machine running
-at 2.67\,GHz.
+a 64-core AMD Opteron 6376 machine at 2.67\,GHz.
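Before turning to the individual test cases, the following sketch
illustrates the global resource ordering mentioned in the discussion
of {\tt queue\_get} above; the {\tt struct} layouts and the field
{\tt id} are illustrative assumptions, not QuickSched's actual data
structures.

\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
#include <stdlib.h>

/* Illustrative layout only, not QuickSched's actual structures. */
struct res  { int id; };                            /* globally unique ID */
struct task { int nr_locks; struct res **locks; };  /* resources to lock  */

/* Compare two resources by their global ID. */
static int cmp_res(const void *a, const void *b) {
  const struct res *ra = *(const struct res *const *)a;
  const struct res *rb = *(const struct res *const *)b;
  return (ra->id > rb->id) - (ra->id < rb->id);
}

/* Sorting every task's lock list by the same global criterion ensures
   that any two tasks sharing resources always try to acquire them in
   the same order, so the perpetual mutual failure described earlier
   cannot occur. */
void task_sort_locks(struct task *t) {
  qsort(t->locks, t->nr_locks, sizeof(struct res *), cmp_res);
}
\end{lstlisting}
\end{minipage}\end{center}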
 \subsection{Task-Based QR Decomposition}
@@ -990,6 +994,13 @@ the DGEQRF tasks (in red) are scheduled as soon as they
 become available in QuickSched, thus preventing bottlenecks
 near the end of the computation.
 
+Since in QuickSched the entire task structure is known explicitly
+in advance, the scheduler ``knows'' that the DGEQRF tasks all
+lie on the longest critical path and therefore executes them as
+soon as possible.
+OmpSs does not exploit this knowledge, resulting in the less efficient
+scheduling seen in \fig{QRTasks}.
+
 \begin{figure}
    \centerline{\epsfig{file=figures/QR_scaling.pdf,width=0.9\textwidth}}
    \caption{Strong scaling and parallel efficiency of the tiled QR decomposition