diff --git a/.gitignore b/.gitignore
index 9ea508a287021304076ca317e5e7412193dabac5..d01112cd371620fa760464b9c3fc8376165547ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,8 @@ m4/ltsugar.m4
 m4/ltversion.m4
 m4/lt~obsolete.m4
 
+paper/paper.pdf
+
 /autom4te.cache
 /aclocal.m4
 /compile
diff --git a/paper/figures/QSched.pdf b/paper/figures/QSched.pdf
index 1945befe7430ab0d1b679acac21dccb52c003f4d..07d6a694d4a8aa9d9c910fa6251e0716a75fe047 100644
Binary files a/paper/figures/QSched.pdf and b/paper/figures/QSched.pdf differ
diff --git a/paper/figures/QSched.svg b/paper/figures/QSched.svg
index 16fef7155135b6613fec2be8eac55ba337e59704..c4351a5bea46c3e9b6baae67edce925316af4bd7 100644
--- a/paper/figures/QSched.svg
+++ b/paper/figures/QSched.svg
@@ -51,14 +51,14 @@
      borderopacity="1.0"
      inkscape:pageopacity="0.0"
      inkscape:pageshadow="2"
-     inkscape:zoom="1"
-     inkscape:cx="331.95424"
-     inkscape:cy="480"
+     inkscape:zoom="2"
+     inkscape:cx="508.11752"
+     inkscape:cy="548.79621"
      inkscape:document-units="px"
      inkscape:current-layer="layer1"
      showgrid="true"
-     inkscape:window-width="1280"
-     inkscape:window-height="753"
+     inkscape:window-width="2560"
+     inkscape:window-height="1393"
      inkscape:window-x="0"
      inkscape:window-y="0"
      inkscape:window-maximized="1">
@@ -74,7 +74,7 @@
         <dc:format>image/svg+xml</dc:format>
         <dc:type
            rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-        <dc:title></dc:title>
+        <dc:title />
       </cc:Work>
     </rdf:RDF>
   </metadata>
@@ -636,5 +636,38 @@
          id="tspan7789"
          x="384"
          y="632.36218">unlock</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3056"
+       y="436.36218"
+       x="654.57031"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold"
+       xml:space="preserve"><tspan
+         y="436.36218"
+         x="654.57031"
+         id="tspan3058"
+         sodipodi:role="line">thread 0</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold"
+       x="654.57031"
+       y="496.36218"
+       id="text3060"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3062"
+         x="654.57031"
+         y="496.36218">thread 1</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3064"
+       y="556.36218"
+       x="654.57031"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Sans Bold"
+       xml:space="preserve"><tspan
+         y="556.36218"
+         x="654.57031"
+         id="tspan3066"
+         sodipodi:role="line">thread 2</tspan></text>
   </g>
 </svg>
diff --git a/paper/figures/TaskWeight.pdf b/paper/figures/TaskWeight.pdf
index 68138a3c630a68295aa440ec325bab856320e84b..20ba81c40739ca83f5a139657d5abc89c4264cc6 100644
Binary files a/paper/figures/TaskWeight.pdf and b/paper/figures/TaskWeight.pdf differ
diff --git a/paper/figures/TaskWeight.svg b/paper/figures/TaskWeight.svg
index 4393ed8823fdd88e41d5846545bf63bed17025a3..b854aa3b2ee0be594481dac83eef5135049c7644 100644
--- a/paper/figures/TaskWeight.svg
+++ b/paper/figures/TaskWeight.svg
@@ -89,14 +89,14 @@
      borderopacity="1.0"
      inkscape:pageopacity="0.0"
      inkscape:pageshadow="2"
-     inkscape:zoom="1"
-     inkscape:cx="249.5"
-     inkscape:cy="522.89698"
+     inkscape:zoom="2"
+     inkscape:cx="295.8911"
+     inkscape:cy="562.92883"
      inkscape:document-units="px"
      inkscape:current-layer="layer1"
      showgrid="true"
-     inkscape:window-width="1366"
-     inkscape:window-height="721"
+     inkscape:window-width="2560"
+     inkscape:window-height="1393"
      inkscape:window-x="0"
      inkscape:window-y="0"
      inkscape:window-maximized="1">
@@ -223,20 +223,29 @@
          id="tspan5715"
          x="-371.12311"
          y="478.59158"
-         style="font-size:20px">cost</tspan></text>
+         style="font-size:20px">cost(A)</tspan></text>
     <text
        xml:space="preserve"
-       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
-       x="-547.51123"
-       y="533.49603"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:start;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       x="525.60541"
+       y="539.85498"
        id="text5717"
-       sodipodi:linespacing="125%"
-       transform="matrix(0,-1,1,0,0,0)"><tspan
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         x="525.60541"
+         y="539.85498"
+         style="font-size:20px;text-align:start;text-anchor:start"
+         id="tspan3201">weight(A) = cost(A) + </tspan><tspan
          sodipodi:role="line"
-         id="tspan5719"
-         x="-547.51123"
-         y="533.49603"
-         style="font-size:20px">weight</tspan></text>
+         x="525.60541"
+         y="564.85498"
+         style="font-size:20px;text-align:start;text-anchor:start"
+         id="tspan3205">                    max{weight(B), </tspan><tspan
+         sodipodi:role="line"
+         x="525.60541"
+         y="589.85498"
+         style="font-size:20px;text-align:start;text-anchor:start"
+         id="tspan3207">                             weight(D)}</tspan></text>
     <path
        style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-end:url(#Arrow2Lend)"
        d="m 228.0315,308.26769 0,478.34646"
@@ -255,5 +264,77 @@
          x="-547.64691"
          y="222.03149"
          style="font-size:20px">time</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3209"
+       y="377.56561"
+       x="355.08435"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       xml:space="preserve"><tspan
+         style="font-size:20px"
+         y="377.56561"
+         x="355.08435"
+         id="tspan3211"
+         sodipodi:role="line">A</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       x="300.77371"
+       y="501.85693"
+       id="text3213"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3215"
+         x="300.77371"
+         y="501.85693"
+         style="font-size:20px">B</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3217"
+       y="589.84717"
+       x="301.13992"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       xml:space="preserve"><tspan
+         style="font-size:20px"
+         y="589.84717"
+         x="301.13992"
+         id="tspan3219"
+         sodipodi:role="line">C</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       x="406.5979"
+       y="545.14832"
+       id="text3221"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3223"
+         x="406.5979"
+         y="545.14832"
+         style="font-size:20px">D</tspan></text>
+    <text
+       sodipodi:linespacing="125%"
+       id="text3225"
+       y="677.85693"
+       x="352.81277"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       xml:space="preserve"><tspan
+         style="font-size:20px"
+         y="677.85693"
+         x="352.81277"
+         id="tspan3227"
+         sodipodi:role="line">E</tspan></text>
+    <text
+       xml:space="preserve"
+       style="font-size:12px;font-style:normal;font-variant:normal;font-weight:bold;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Bitstream Vera Sans;-inkscape-font-specification:Bitstream Vera Sans Bold"
+       x="458.92017"
+       y="721.14832"
+       id="text3229"
+       sodipodi:linespacing="125%"><tspan
+         sodipodi:role="line"
+         id="tspan3231"
+         x="458.92017"
+         y="721.14832"
+         style="font-size:20px">F</tspan></text>
   </g>
 </svg>
diff --git a/paper/paper.tex b/paper/paper.tex
index 03fdc39ae72c00883c4bfce2b0bf4d059e9cd008..4d1236b1b66c8a18520276d237655151ce5c0546 100644
--- a/paper/paper.tex
+++ b/paper/paper.tex
@@ -1,4 +1,4 @@
-\documentclass[preprint]{elsarticle}
+\documentclass[fleqn,10pt]{wlpeerj}
 
 % Package to generate and customize Algorithm as per ACM style
 \usepackage[ruled]{algorithm2e}
@@ -47,18 +47,16 @@
     {\textbf{\textsf{#1}}}
 
 
-% Document starts
-\begin{document}
-
 % Title portion
 \title{QuickSched: Task-based parallelism with dependencies and conflicts}
-\author[ecs,ggl]{Pedro Gonnet\corref{cor}} \ead{pedro.gonnet@durham.ac.uk}
-\author[ecs]{Aidan B. G. Chalk} \ead{aidan.chalk@durham.ac.uk}
-\author[icc]{Matthieu Schaller} \ead{matthieu.schaller@durham.ac.uk}
-\cortext[cor]{Corresponding author}
-\address[ecs]{School of Engineering and Computing Sciences, Durham University, South Road, Durham DH1 3LE, United Kingdom.}
-\address[icc]{Institute for Computational Cosmology, Durham University, South Road, Durham DH1 3LE, United Kingdom.}
-\address[ggl]{Google Switzerland GmbH, Brandshenkestr. 110, 8002 Z\"urich, Switzerland.}
+\author[1,3]{Pedro Gonnet}
+\author[1]{Aidan B. G. Chalk}
+\author[2]{Matthieu Schaller}
+\affil[1]{School of Engineering and Computing Sciences, Durham University, United Kingdom.}
+\affil[2]{Institute for Computational Cosmology, Durham University, United Kingdom.}
+\affil[3]{Google Switzerland GmbH, Z\"urich, Switzerland.}
+
+\keywords{task-based parallelism, shared-memory parallelism, high performance computing, parallel computing}
 
 \begin{abstract}
 This paper describes QuickSched, a compact and efficient Open-Source
@@ -68,16 +66,18 @@ programming with the concept of task conflicts, i.e.~sets of tasks
 that can be executed in any order, yet not concurrently.
 These conflicts are modelled using exclusively lockable
 hierarchical resources.
-The scheduler is shown to perform and scale well on a 64-core parallel
+The scheduler itself prioritizes tasks along the critical path
+of execution and is shown to perform and scale well on a 64-core parallel
 shared-memory machine for two example problems: A tiled QR
 decomposition and a task-based Barnes-Hut tree code.
 \end{abstract}
 
-\begin{keyword}
-    task-based parallelism \sep shared-memory parallelism \sep high performance computing \sep parallel computing
-\end{keyword}
+% Document starts
+\begin{document}
 
+\flushbottom
 \maketitle
+\thispagestyle{empty}
 
 
 \section{Introduction}
@@ -95,15 +95,22 @@ respectively, of a Directed Acyclic Graph (DAG) which can be
 traversed in topological order, executing the tasks at the nodes
 on the way down.
 
-This computational model is trivial to parallelize.
-Given a set of inter-dependent tasks and a set of computational
+Once modelled in such a way, the computation is somewhat trivial
+to parallelize:
+given a set of inter-dependent tasks and a set of computational
 threads, each thread repeatedly selects a task with no
 unsatisfied dependencies from the DAG and executes it.
 If no tasks are available, the thread waits until any other
 thread finishes executing a task, thus potentially releasing
 new tasks, or until all tasks in the DAG have been executed.
-\fig{Tasks} shows such a DAG for a set of tasks.
-The arrows indicate the direction of the dependency, i.e.~an
+Note that although the parallel execution
+itself is trivial, it does not always guaranteed to be efficient.
+Several factors may limit the maximum degree of parallelism, e.g.~the
+structure of the task dependency DAG itself, or the order in which
+available tasks are executed.
+
+\fig{Tasks} shows such a DAG for a set of tasks with
+arrows indicating the direction of the dependencies, i.e.~an
 arrow from task $A$ to task $B$ indicates that task $B$ depends
 on task $A$.
 In a parallel setting, tasks $A$, $G$, and $J$ can be
@@ -111,6 +118,7 @@ executed concurrently.
 Once task $G$ has completed, tasks $F$ and $H$ become available
 and can be executed by any other computational thread.
 
+
 \begin{figure}
     \centerline{\epsfig{file=figures/Tasks.pdf,width=0.5\textwidth}}
     \caption{A set of tasks (circles) and their dependencies (arrows).
@@ -126,15 +134,15 @@ and can be executed by any other computational thread.
 \end{figure}
 
 One of the first implementations of a task-based parallel programming
-systems is Cilk \cite{ref:Blumofe1995}, an extension to the C
+systems is Cilk \citep{ref:Blumofe1995}, an extension to the C
 programming language which allows function calls to be ``spawned''
 as new tasks.
 Dependencies are enforced by the {\tt sync} keyword, which
 forces a thread to wait for all the tasks that it spawned
 to complete.
 
-In SMP superscalar \cite{ref:Perez2008}, StarPU \cite{ref:Augonnet2011},
-QUARK \cite{ref:Yarkhan2011}, and KAAPI \cite{ref:Gautier2007}
+In SMP superscalar \citep{ref:Perez2008}, StarPU \citep{ref:Augonnet2011},
+QUARK \citep{ref:Yarkhan2011}, and KAAPI \citep{ref:Gautier2007}
 the programmer specifies
 what shared data each task will access, and how that data will
 be accessed, e.g.~read, write, or read-write access.
@@ -145,7 +153,7 @@ the tasks are generated.
 StarPU also provides an interface for specifying additional
 dependencies explicitly.
 Intel's Threading Building Blocks (TBB)
-\cite{ref:Reinders2010}
+\citep{ref:Reinders2010}
 provide task-based parallelism using C++ templates in which
 dependencies are handled either by explicitly waiting
 for spawned tasks, or by explicitly manipulating
@@ -153,8 +161,8 @@ task reference counters.
 
 Finally, the very popular OpenMP standard provides some basic support
 for spawning tasks, similar to Cilk, as of version 3.0
-\cite{ref:OpenMP2008}.
-OmpSs \cite{ref:Duran2011} extends this scheme with automatic
+\citep{ref:OpenMP2008}.
+OmpSs \citep{ref:Duran2011} extends this scheme with automatic
 dependency generation as in SMP superscalar, of which it
 is a direct descendant, along with
 the ability to explicitly wait on certain tasks.
@@ -168,7 +176,7 @@ Consider the case of two tasks that update some shared resource
 in an order-independent way, e.g. when accumulating a result in
 a shared variable, or exclusively writing to an output file.
 In order to avoid concurrent access to that resource, it is
-imperative that the execution of both tasks do not overlap,
+imperative that the execution of both tasks does not overlap,
 yet the order in which the tasks are executed is irrelevant.
 In the following, such a relationship will be referred to
 as a {\em conflict} between two tasks.
@@ -189,6 +197,14 @@ this problem in their respective implementations of the Fast Multipole
 Method (FMM), in which forces computed in different tasks are
 accumulated on a set of particles.
 
+Several libraries provide some mechanism to model such
+conflicts, either directly or indirectly.
+In the QUARK scheduler, conflicts can be modeled by explicitly
+marking dependencies as concurrent.
+KAAPI and OmpSS, on the other hand, allow marking access to
+certain variables as reductions, yet only for basic operations,
+e.g.~summation or maximum/minimum.
+
 This paper presents QuickSched, a framework for task-based
 parallel programming with constraints, which aims to achieve
 the following goals:
@@ -200,7 +216,8 @@ the following goals:
     \item {\em Memory/cache efficiency}: Tasks accessing similar
         sets of data should be preferentially executed on the
         same core to preserve memory/cache locality as far as possible, and
-    \item {\em Parallel efficiency}: Tasks should be executed in an order
+    \item {\em Parallel efficiency}: The order in which the tasks
+        are executed should be chosen such
         that sufficient work is available for all computational
         threads at all times.
 \end{itemize}
@@ -241,9 +258,9 @@ From a programmer's perspective, there are two main paradigms for generating
 task dependencies:
 \begin{itemize}
   \item Implicitly via spawning and waiting, e.g.~as is done in Cilk
-    \cite{ref:Blumofe1995}, or
-  \item Automatic extraction from data dependencies, e.g.~as is done in OmpSs
-    \cite{ref:Duran2011}.
+    and OpenMP~3.0, or
+  \item Automatic extraction from data dependencies, e.g.~as is done in
+    StarPU, QUARK, and OmpSs.
 \end{itemize}
 
 The first scheme, spawning and waiting, is arguably the simplest to
@@ -371,13 +388,18 @@ and granularity such that
 \end{itemize}
 \noindent The first critera is biased towards bigger tasks, while the
 second limits their size.
-The criteria are thus used to optimize the
-cache efficiency of the computation.
+The parameters controlling the size of the tasks in the examples, 
+i.e.~the tile size in the QR decomposition and the limits $n_\mathsf{max}$
+and $n_\mathsf{task}$ were determined empirically and only optimized
+to the closest power of two or rough power of ten, respectively.
+Further tuning these parameters could very likely lead to further
+performance gains, but such an effort would go beyond the scope,
+and point, of this paper.
 
 
 \section{Data Structures and Algorithms}
 
-The QuickSched task scheduler consist of four main
+The QuickSched task scheduler consists of four main
 objects types: {\em task}, {\em resource}, {\em scheduler},
 and {\em queue}.
 The task and resource objects are used
@@ -458,7 +480,7 @@ struct task {
 and {\tt uses} arrays are pointers to the contents of other
 arrays, i.e.~they are not allocated individually.
 
-What the task does is determined by the {\tt type}
+{\em What} the task does is determined by the {\tt type}
 field, e.g.~which can be mapped to any particular function,
 and the {\tt data} pointer which points to an array of
 {\tt size\_data} bytes containing data specific to the task,
@@ -494,10 +516,19 @@ for the relative computational cost of this task, and the
 relative cost of the critical path following the
 dependencies of this task, respectively, i.e.~the task's cost
 plus the maximum dependent task's weight (see \fig{TaskWeight}).
+The task cost can be either a rough estimate provided by the user,
+or the actual cost of the same task last time it was executed.
 The task weights are computed by traversing
-the tasks in reverse topological order following their dependencies, i.e.
-as per \cite{ref:Kahn1962}, and computing each task's weight
-up the task DAG.
+the tasks DAG in reverse topological order following their dependencies,
+e.g.~as per \cite{ref:Kahn1962} in $\mathcal O(n)$ for $n$ tasks,
+and computing each task's weight, e.g.
+\begin{equation*}
+  \mbox{weight}_i = \mbox{cost}_i + \max_{j \in \mbox{\small unlocks}_i}\left\{\mbox{weight}_j\right\}.
+\end{equation*}
+\noindent where $\mbox{weight}_i$ and $\mbox{cost}_i$ are the
+weight and cost of the $i$th task, respectively, and 
+$\mbox{unlocks}_i$ is the set of tasks that the $i$th task
+unlocks.
 
 \begin{figure}
     \centerline{\epsfig{file=figures/TaskWeight.pdf,height=0.4\textwidth}}
@@ -652,7 +683,7 @@ operations for both insertion and deletion, i.e. for the
 bubble-up and trickle-down operations respectively.
 
 Unfortunately, there is no way of efficiently traversing all
-the elements in the heap in decreasing order.
+the elements in such a heap in decreasing order.
 The array of tasks is therefore traversed as if it were sorted,
 returning the first task that can be locked.
 Although the first task in the array will be the task with
@@ -718,7 +749,7 @@ struct task *queue_get(struct queue *q) {
 }
     \end{lstlisting}
 \end{minipage}\end{center}
-\noindent where as with the queue insertion, the queue is first
+\noindent where, as with the queue insertion, the queue is first
 locked for exclusive access (line~4).
 The array of task pointers is then traversed (line~5),
 locking the resources of each task (lines~6--7).
@@ -728,8 +759,8 @@ are released (lines~9--10), otherwise, the traversal is aborted
 If all the locks on a task could be obtained (line~14), the
 task pointer is replaced by the last pointer in the heap (line~16)
 and the heap order is restored (line~17).
-Finally, the queue lock is released (line~19) and the locked task
-or, if no lockable task could be found, {\tt NULL} is returned.
+Finally, the queue lock is released (line~19) and the locked task,
+or {\tt NULL} if no lockable task could be found, is returned.
 
 Note that this approach of sequentially locking multiple resources
 is prone to the so-called ``dining philosophers'' problem, i.e.~if
@@ -740,6 +771,16 @@ This type of deadlock, however, is easily avoided by sorting the
 resources in each task according to some global criteria, e.g.~the
 resource ID or the address in memory of the resource.
 
+Note also that protecting the entire queue with a mutex
+is not particularly scalable, and several authors, e.g.~\cite{ref:Sundell2003},
+have presented concurrent data structures that avoid this type
+of locking.
+However, since we normally use one queue per computational thread,
+contention will only happens due to work-stealing, i.e.~when
+another idle computational thread tries to poach tasks.
+Since this happens only rarely, we opt for the simpler locking approach.
+This decision is backed by the results in Section~5.
+
 \subsection{Scheduler}
 
 The scheduler object is used as the main interface to the
@@ -782,13 +823,14 @@ void qsched_run(qsched *s, void (*fun)(int, void *)) {
 \end{minipage}\end{center}
 \noindent where {\tt qsched\_start} initializes the tasks and
 fills the queues (line~1).
-For simplicity, OpenMP \cite{ref:Dagum1998}, which is available
+For simplicity, OpenMP \citep{ref:Dagum1998}, which is available
 for most compilers, is used to create a parallel section
 in which the code between lines~4 and~11 is executed
 concurrently.
-A version using {\tt pthreads} \cite{ref:Pthreads1995}
+A version using {\tt pthreads} \citep{ref:Pthreads1995}
 directly\footnote{In most environments, OpenMP is implemented
-on top of {\tt pthreads}, e.g. gcc's libgomp.} is also available.
+on top of {\tt pthreads}, e.g. the {\tt gcc} compiler's libgomp.}
+is also available.
 The parallel section consists of a loop (lines~7--10) in
 which a task is acquired via {\tt qsched\_gettask}
 and its type and data are passed to a user-supplied
@@ -864,7 +906,7 @@ i.e.~it loops over all other queues
 in a random order (line~6) and tries to get a task from them
 (line~7).
 If a task could be obtained from any queue and task re-owning
-is switched on (line~12),
+is enabled (line~12),
 the resources it locks and uses are marked as now being owned
 by the preferred queue (lines~13--16).
 Finally, the task, or {\tt NULL} if no task could be obtained,
@@ -886,7 +928,7 @@ how QuickSched can be used in real-world applications, and
 provides benchmarks to assess its efficiency and scalability.
 The first test is the tiled QR decomposition originally
 described in \cite{ref:Buttari2009}, which has been used as a benchmark
-by other authors \cite{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}.
+by other authors \citep{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}.
 This example only requires dependencies and is presented 
 as a point of comparison to existing task-based parallel
 programming infrastructures.
@@ -912,10 +954,10 @@ parallelism for tile-based algorithms in numerical linear algebra,
 presenting parallel codes for the Cholesky, LU, and QR
 decompositions.
 These algorithms are now part of the PLASMA and MAGMA
-libraries for parallel linear algebra \cite{ref:Agullo2009}.
+libraries for parallel linear algebra \citep{ref:Agullo2009}.
 The former uses the QUARK task scheduler, which was originally
 designed for this specific task, while the latter currently uses
-the StarPU task scheduler \cite{ref:Agullo2011}.
+the StarPU task scheduler \citep{ref:Agullo2011}.
 
 \begin{figure}
     \centerline{\epsfig{file=figures/QR.pdf,width=0.9\textwidth}}
@@ -958,16 +1000,22 @@ previous level, i.e.~the task $(i,j,k)$ always depends on
 $(i,j,k-1)$ for $k>1$.
 Each task also modifies its own tile $(i,j)$, and the DTSQRF
 task additionally modifies the lower triangular part of the $(j,j)$th tile.
+
 Although the tile-based QR decomposition requires only dependencies,
 i.e.~no additional conflicts are needed to avoid concurrent access to
 the matrix tiles, we still model each tile as a separate resource
 in QuickSched such that the scheduler can preferrentially assign
 tasks using the same tiles to the same thread.
+The resources/tiles are initially assigned to the queues in column-major
+order, i.e.~the first $\lfloor n_\mathsf{tiles}/n_\mathsf{queues}\rfloor$
+are assigned to the first queue, and so on.
 
 The QR decomposition was computed for a $2048\times 2048$
 random matrix using tiles of size $64\times 64$ floats using QuickSched
 as described above.
-For this matrix, a total of 11440 tasks with 32240 dependencies
+The task costs were initialized to the asymptotic cost of the underlying
+operations.
+For this matrix, a total of 11\,440 tasks with 32\,240 dependencies
 were generated.
 
 For these tests, {\tt pthread} parallelism and resource re-owning
@@ -978,16 +1026,21 @@ efficiency results in \fig{QRResults}.
 The timings are for {\tt qsched\_run}, including the cost of
 {\tt qsched\_start}, which does not run in parallel.
 Setting up the scheduler, tasks, and resources took, in all
-cases, an average of 7.2\,ms.
+cases, an average of 7.2\,ms, i.e.~at most 3\% of the total
+computational cost.
 
 The same decomposition was implemented using OmpSs v.\,1.99.0,
 calling the kernels directly using {\tt \#pragma omp task}
 annotations with the respective dependencies, and
 the runtime parameters
 \begin{quote}
-    \tt --disable-yield --schedule=socket --cores-per-socket=16 \\--num-sockets=4
+  \tt --disable-yield --schedule=socket --cores-per-socket=16 \\--num-sockets=4
 \end{quote}
-\noindent The scaling and efficiency relative to QuickSched are 
+\noindent Several different schedulers and parameterizations
+were discussed with the authors of OmpSs and tested, with
+the above settings producing the best results.
+
+The scaling and efficiency relative to QuickSched are 
 shown in \fig{QRResults}.
 The difference in timings is the result of the different
 task scheduling policies, as well as a smaller lag between the
@@ -1003,7 +1056,7 @@ Since in QuickSched the entire task structure is known explicitly
 in advance, the scheduler ``knows'' that the DGEQRF tasks all
 lie on the longest critical path and therefore executes them as
 soon as possible.
-OmpSs, does not exploit this knowledge, resulting in the less efficient
+OmpSs does not exploit this knowledge, resulting in the less efficient
 scheduling seen in \fig{QRTasks}.
 
 \begin{figure}
@@ -1030,11 +1083,11 @@ scheduling seen in \fig{QRTasks}.
 
 \subsection{Task-Based Barnes-Hut N-Body Solver}
 
-The Barnes-Hut tree-code \cite{ref:Barnes1986}
+The Barnes-Hut tree-code \citep{ref:Barnes1986}
 is an algorithm to approximate the
 solution of an $N$-body problem, i.e.~computing all the
 pairwise interactions between a set of $N$ particles,
-in \oh{N\log N} operations, as opposed to the \oh{N^2}
+in \oh{N\log N} operations, as opposed to in \oh{N^2} for the
 naive direct computation.
 The algorithm is based on a recursive octree decomposition:
 Starting from a cubic cell containing all the particles,
@@ -1070,7 +1123,7 @@ The current approach, illustrated in \fig{CellParts} is not
 only more compact, it also allows a direct and more cache-efficient access
 to the list of particles for any inner node of the tree.
 The cost of sorting the particles, with a recursive
-partitioning similar to QuickSort \cite{ref:Hoare1962},
+partitioning similar to QuickSort \citep{ref:Hoare1962},
 is in \oh{N\log N}.
 
 \begin{figure}
@@ -1113,18 +1166,18 @@ The function recurses as follows (line numbers refer to \fig{MakeTasks}:
         recurse over all pairs of sub-cells spanning
         both cells (lines~24--26), and
     \item If called with two neighbouring cells
-        and one of the cells are not split, create
+        and at least one of the cells is not split, create
         a particle-particle pair task over both cells (line~29),
     \item If called with two non-neighbouring cells,
         do nothing, as these interactions
         will be computed by the particle-cell task.
 \end{itemize}
-\noindent where every interaction task additionally locks
+\noindent Every interaction task additionally locks
 the cells on which it operates (lines~17, 20, and 32--33).
 In order to prevent generating
 a large number of very small tasks, the task generation only recurses
 if the cells contain more than a minimum number $n_\mathsf{task}$
-of threads each (lines~7 and~23).
+of particles each (lines~7 and~23).
 
 As shown in \fig{BHTasks}, the particle-self and particle-particle pair
 interaction tasks are implemented
@@ -1163,6 +1216,11 @@ Using the above scheme generated 97\,553 tasks, of which
 512 self-interaction tasks, 5\,068 particle-particle interaction
 task, and 32\,768 particle-cell interaction tasks.
 A total of 43\,416 locks on 37\,449 resources were generated.
+Setting up the tasks took, on average XXX\,ms, i.e.~at most
+XXX\% of the total computation time.
+Storing the tasks, resources, and dependencies required XXX\,MB,
+which is only XX\% of the XXX\,MB required to store the particle
+data.
 
 For these tests, {\tt pthread}s parallelism was used and resource
 re-owning was switched off.
@@ -1179,10 +1237,10 @@ Setting up the scheduler, tasks, and resources took, in all
 cases, an average of 51.3\,ms.
 
 For comparison, the same computations were run using the popular
-astrophysics simulation software Gadget-2 \cite{ref:Springel2005},
+astrophysics simulation software Gadget-2 \citep{ref:Springel2005},
 using a traditional Barnes-Hut implementation based on octrees
 and distributed-memory parallelism based on domain decompositions
-and MPI \cite{ref:Snir1998}.
+and MPI \citep{ref:Snir1998}.
 To achieve the same accuracy, an opening angle of 0.5 was used.
 On a single core, the task-based tree traversal is already 1.9$\times$
 faster than Gadget-2, due to the cache efficiency of the task-based
@@ -1198,7 +1256,7 @@ to the MPI-based parallelism in Gadget-2.
     \caption{Strong scaling and parallel efficiency of the Barnes-Hut tree-code
         computed over 1\,000\,000 particles.
         Solving the N-Body problem takes 323\,ms, achieving 75\% parallel
-        efficiency, over all 64 cores.
+        efficiency over all 64 cores.
         For comparison, timings are shown for the same computation using
         the popular astrophysics code Gadget-2.
         The scaling for Gadget-2 (left) is shown relative to the performance of
@@ -1229,16 +1287,22 @@ At 64 cores, the scheduler overheads account for only $\sim 1$\% of
 the total computational cost, whereas,
 as of 32 cores, the cost of both pair types grow by up to
 40\%.
-This is due to memory bandwidth restrictions, as
-the cost of the particle-cell interaction tasks, which do significantly more
-computation per memory access, only grow by up to 10\%.
+This is due to the cache hierarchy of the AMD Opteron 6376 in which
+pairs of cores share a comon 2\,MB L2 cache.
+When using half the cores or less, each core has its L@ cache to
+itself, whereas beyond 32 cores they are shared, resulting in more
+frequent cache misses.
+This cen be seen when comparing the costs of the particle-particle
+interaction and particle-cell interaction tasks: while the former grow by
+roughly 30\%, the latter grow by only 10\% as they do much more
+computation per memory access.
 
 \begin{figure}
     \centerline{\epsfig{file=figures/BH_times.pdf,width=0.8\textwidth}}
     \caption{Accumulated cost of each task type and of the overheads
         associated with {\tt qsched\_gettask}, summed over all cores.
         As of 32 cores, the cost of both pair interaction task
-        types grow by up to 40\%.
+        types grow by up to 30\%.
         The cost of the particle-cell interactions, which entail significantly more
         computation per memory access, grow only by at most 10\%.
         The scheduler overheads, i.e.~{\tt qsched\_gettask},
@@ -1331,15 +1395,16 @@ v\,3.0 and is available for download via
 
 % Acknowledgments
 \section*{Acknowledgments}
-The authors would like to thank Lydia Heck of the Institute for
-Computational Cosmology at Durham University for providing access
-to, and expertise on, the COSMA cluster used in the performance
-evaluation.
-This work was supported by a Durham University Seedcorn Grant.
+The authors would like to thank Tom Theuns and Richard Bowers of the
+Institute for Computational Cosmology at Durham University for the
+helpful discussions.
+This work was supported by a Durham University Seedcorn Grant
+number 21.12.080130 from
+which the hardware used in the experiments was purchased.
 
 
 % Bibliography
-\bibliographystyle{elsarticle-num}
+% \bibliographystyle{elsarticle-num}
 \bibliography{quicksched}
 
 
@@ -1529,7 +1594,7 @@ Similarly, {\tt rid} stores the handles of the resources for each
 tile of the matrix, which are allocated in line~8.
 
 The following loops mirror the task generation described in
-Algorithm~2 of \cite{ref:Buttari2009}.
+Algorithm~2 of \citep{ref:Buttari2009}.
 For each level {\tt k} (line~10), a DGEQRF task is created
 for tile $(k,k)$ (lines~13--14).
 A lock is added for the newly created task on the
@@ -1584,7 +1649,7 @@ struct cell {
   struct cell *progeny[8];
   qsched_res_t res;
   qsched_task_t task_com;
-  };
+};
     \end{lstlisting}
 \end{minipage}\end{center}
 \noindent where {\tt loc} and {\tt h} are the location
@@ -1606,7 +1671,7 @@ data of the form:
 struct part {
   double x[3], a[3], mass;
   int id;
-  };
+};
     \end{lstlisting}
 \end{minipage}\end{center}
 \noindent i.e.~the particle position, acceleration, mass,
diff --git a/paper/quicksched.bib b/paper/quicksched.bib
index 0908731b3859e2cf9152b8e7cd10ae80929c66ae..99cbdd0165c134dc0287fa5bf1ddd4bf7b6f04a2 100644
--- a/paper/quicksched.bib
+++ b/paper/quicksched.bib
@@ -1,3 +1,12 @@
+@inproceedings{ref:Sundell2003,
+  title={Fast and lock-free concurrent priority queues for multi-thread systems},
+  author={Sundell, H{\aa}kan and Tsigas, Philippas},
+  booktitle={Parallel and Distributed Processing Symposium, 2003. Proceedings. International},
+  pages={11--pp},
+  year={2003},
+  organization={IEEE}
+}
+
 @article{ref:Barnes1986,
   title={A hierarchical O (N log N) force-calculation algorithm},
   author={Barnes, Josh and Hut, Piet},
diff --git a/paper/wlpeerj.cls b/paper/wlpeerj.cls
new file mode 100644
index 0000000000000000000000000000000000000000..3c71d7e6894f3cfb331987c5ad36871791d1e7f5
--- /dev/null
+++ b/paper/wlpeerj.cls
@@ -0,0 +1,204 @@
+%
+% An unofficial LaTeX class for PeerJ articles.
+%
+% Created by writeLaTeX.
+%
+% Based on the SelfArx document class.
+%
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesClass{wlpeerj}[23/01/2014, v1.0]
+\RequirePackage[utf8]{inputenc}
+\RequirePackage[english]{babel}
+
+\RequirePackage{ifthen}
+\RequirePackage{calc}
+\AtEndOfClass{\RequirePackage{microtype}}
+\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}}
+\ProcessOptions*
+\LoadClass{article}
+\RequirePackage{times}      % Loads the Times-Roman Fonts
+\RequirePackage{mathptmx}   % Loads the Times-Roman Math Fonts
+\RequirePackage{ifpdf}
+
+\RequirePackage{amsmath,amsfonts,amssymb}
+\RequirePackage{graphicx,xcolor}
+\RequirePackage{booktabs}
+\RequirePackage{authblk}
+
+\RequirePackage[left=5cm,%
+                right=2cm,%
+                top=2.25cm,%
+                bottom=2.25cm,%
+                headheight=12pt,%
+                letterpaper]{geometry}%
+                
+\RequirePackage[labelfont={bf,sf},%
+                labelsep=period,%
+                justification=raggedright]{caption}
+                
+\RequirePackage{natbib}
+\bibliographystyle{apalike}
+
+%
+% writeLaTeX logo
+%
+\newcommand\wllogo{%
+\renewcommand*\rmdefault{ugq}\normalfont\upshape{}write%
+\renewcommand*\rmdefault{cmr}\normalfont\upshape{\bf\LaTeX}}
+                
+%
+% headers and footers
+%
+\RequirePackage{fancyhdr}  % custom headers/footers
+\RequirePackage{lastpage}  % Number of pages in the document
+\pagestyle{fancy}          % Enables the custom headers/footers
+% Headers
+\lhead{}%
+\chead{}%
+\rhead{}%
+% Footers
+\lfoot{}%
+\cfoot{}%
+\rfoot{\small\sffamily\bfseries\thepage/\pageref{LastPage}}%
+\renewcommand{\headrulewidth}{0pt}% % No header rule
+\renewcommand{\footrulewidth}{0pt}% % No footer rule
+
+%
+% section/subsection/paragraph set-up
+%
+\RequirePackage[explicit]{titlesec}
+\titleformat{\section}
+  {\color{color1}\large\sffamily\bfseries}
+  {\thesection}
+  {0.5em}
+  {\MakeUppercase{#1}}
+  []
+\titleformat{name=\section,numberless}
+  {\color{color1}\large\sffamily\bfseries}
+  {}
+  {0em}
+  {\MakeUppercase{#1}}
+  []  
+\titleformat{\subsection}
+  {\sffamily\bfseries}
+  {\thesubsection}
+  {0.5em}
+  {#1}
+  []
+\titleformat{\subsubsection}
+  {\sffamily\small\bfseries\itshape}
+  {\thesubsubsection}
+  {0.5em}
+  {#1}
+  []    
+\titleformat{\paragraph}[runin]
+  {\sffamily\small\bfseries}
+  {}
+  {0em}
+  {#1} 
+\titlespacing*{\section}{0pc}{3ex \@plus4pt \@minus3pt}{5pt}
+\titlespacing*{\subsection}{0pc}{2.5ex \@plus3pt \@minus2pt}{0pt}
+\titlespacing*{\subsubsection}{0pc}{2ex \@plus2.5pt \@minus1.5pt}{0pt}
+\titlespacing*{\paragraph}{0pc}{1.5ex \@plus2pt \@minus1pt}{10pt}
+
+%
+% tableofcontents set-up
+%
+\usepackage{titletoc}
+\contentsmargin{0cm}
+\titlecontents{section}[\tocsep]
+  {\addvspace{4pt}\small\bfseries\sffamily}
+  {\contentslabel[\thecontentslabel]{\tocsep}}
+  {}
+  {\hfill\thecontentspage}
+  []
+\titlecontents{subsection}[\tocsep]
+  {\addvspace{2pt}\small\sffamily}
+  {\contentslabel[\thecontentslabel]{\tocsep}}
+  {}
+  {\ \titlerule*[.5pc]{.}\ \thecontentspage}
+  []
+\titlecontents*{subsubsection}[\tocsep]
+  {\footnotesize\sffamily}
+  {}
+  {}
+  {}
+  [\ \textbullet\ ]  
+  
+\RequirePackage{enumitem}
+%\setlist{nolistsep} % Uncomment to remove spacing between items in lists (enumerate, itemize)
+
+% Remove brackets from numbering in List of References
+\renewcommand{\@biblabel}[1]{\bfseries\color{color1}\textsuperscript{[#1]}}
+
+%
+% article meta data
+%
+\newcommand{\keywords}[1]{\def\@keywords{#1}}
+
+\def\xabstract{abstract}
+\long\def\abstract#1\end#2{\def\two{#2}\ifx\two\xabstract 
+\long\gdef\theabstract{\ignorespaces#1}
+\def\go{\end{abstract}}\else
+\typeout{^^J^^J PLEASE DO NOT USE ANY \string\begin\space \string\end^^J
+COMMANDS WITHIN ABSTRACT^^J^^J}#1\end{#2}
+\gdef\theabstract{\vskip12pt BADLY FORMED ABSTRACT: PLEASE DO
+NOT USE {\tt\string\begin...\string\end} COMMANDS WITHIN
+THE ABSTRACT\vskip12pt}\let\go\relax\fi
+\go}
+
+%
+% custom title page 
+%
+\renewcommand{\@maketitle}{%
+{%
+\thispagestyle{empty}%
+\vskip-36pt%
+{\raggedright\sffamily\bfseries\fontsize{20}{25}\selectfont \@title\par}%
+\vskip10pt
+{\raggedright\sffamily\fontsize{12}{16}\selectfont  \@author\par}
+\vskip18pt%
+{%
+\noindent
+{\parbox{\dimexpr\linewidth-2\fboxsep\relax}{\color{color1}\large\sffamily\textbf{ABSTRACT}}}
+}%
+\vskip10pt
+{%
+\noindent
+\colorbox{color2}{%
+\parbox{\dimexpr\linewidth-2\fboxsep\relax}{%
+\sffamily\small\textbf\\\theabstract
+}%
+}%
+\vskip18pt%
+\noindent
+\parbox{\dimexpr\linewidth-2\fboxsep\relax}{%
+{\color{color1}\keywordname\hspace*{1em}} \@keywords%
+}%
+}%
+\vskip25pt%
+}%
+}%
+%-----------------------------------------------
+\setlength{\columnsep}{0.55cm} % Distance between the two columns of text
+\setlength{\fboxrule}{0.75pt} % Width of the border around the abstract
+
+\definecolor{color1}{RGB}{0,0,0} % Color of section headings
+\definecolor{color2}{RGB}{250,232,207} % Color of the box behind the abstract
+\newcommand{\keywordname}{Keywords:} % Defines the keywords heading name
+
+\renewcommand\Authfont{\fontsize{12}{12}\usefont{OT1}{phv}{b}{n}}
+\renewcommand\Affilfont{\fontsize{10}{10}\usefont{OT1}{phv}{b}{n}}
+
+\newlength{\tocsep} 
+\setlength\tocsep{1.5pc} % Sets the indentation of the sections in the table of contents
+\setcounter{tocdepth}{3} % Show only three levels in the table of contents section: sections, subsections and subsubsections
+
+\usepackage{lipsum} % Required to insert dummy text
+%-----------------------------------------------
+\let\oldbibliography\thebibliography
+\renewcommand{\thebibliography}[1]{%
+\addcontentsline{toc}{section}{\hspace*{-\tocsep}\refname}%
+\oldbibliography{#1}%
+\setlength\itemsep{0pt}%
+}
\ No newline at end of file