diff --git a/paper/figures/BH_times.eps b/paper/figures/BH_times.eps new file mode 100644 index 0000000000000000000000000000000000000000..f4a80a20e6e5e69ce99899b03a6f6f23341e7837 --- /dev/null +++ b/paper/figures/BH_times.eps @@ -0,0 +1,457 @@ +%!PS-Adobe-3.0 EPSF-3.0 +%%Creator: MATLAB, The MathWorks, Inc. Version 8.0.0.783 (R2012b). Operating System: Linux 3.8.0-30-generic #44-Ubuntu SMP Thu Aug 22 20:52:24 UTC 2013 x86_64. +%%Title: figures/BH_times.eps +%%CreationDate: 11/06/2013 09:10:44 +%%DocumentNeededFonts: Helvetica +%%DocumentProcessColors: Cyan Magenta Yellow Black +%%LanguageLevel: 2 +%%Pages: 1 +%%BoundingBox: 30 16 493 297 +%%EndComments + +%%BeginProlog +% MathWorks dictionary +/MathWorks 160 dict begin +% definition operators +/bdef {bind def} bind def +/ldef {load def} bind def +/xdef {exch def} bdef +/xstore {exch store} bdef +% operator abbreviations +/c /clip ldef +/cc /concat ldef +/cp /closepath ldef +/gr /grestore ldef +/gs /gsave ldef +/mt /moveto ldef +/np /newpath ldef +/cm /currentmatrix ldef +/sm /setmatrix ldef +/rm /rmoveto ldef +/rl /rlineto ldef +/s {show newpath} bdef +/sc {setcmykcolor} bdef +/sr /setrgbcolor ldef +/sg /setgray ldef +/w /setlinewidth ldef +/j /setlinejoin ldef +/cap /setlinecap ldef +/rc {rectclip} bdef +/rf {rectfill} bdef +% page state control +/pgsv () def +/bpage {/pgsv save def} bdef +/epage {pgsv restore} bdef +/bplot /gsave ldef +/eplot {stroke grestore} bdef +% orientation switch +/portraitMode 0 def /landscapeMode 1 def /rotateMode 2 def +% coordinate system mappings +/dpi2point 0 def +% font control +/FontSize 0 def +/FMS {/FontSize xstore findfont [FontSize 0 0 FontSize neg 0 0] + makefont setfont} bdef +/reencode {exch dup where {pop load} {pop StandardEncoding} ifelse + exch dup 3 1 roll findfont dup length dict begin + { 1 index /FID ne {def}{pop pop} ifelse } forall + /Encoding exch def currentdict end definefont pop} bdef +/isroman {findfont /CharStrings get /Agrave known} bdef +/FMSR {3 1 roll 1 
index dup isroman {reencode} {pop pop} ifelse + exch FMS} bdef +/csm {1 dpi2point div -1 dpi2point div scale neg translate + dup landscapeMode eq {pop -90 rotate} + {rotateMode eq {90 rotate} if} ifelse} bdef +% line types: solid, dotted, dashed, dotdash +/SO { [] 0 setdash } bdef +/DO { [.5 dpi2point mul 4 dpi2point mul] 0 setdash } bdef +/DA { [6 dpi2point mul] 0 setdash } bdef +/DD { [.5 dpi2point mul 4 dpi2point mul 6 dpi2point mul 4 + dpi2point mul] 0 setdash } bdef +% macros for lines and objects +/L {lineto stroke} bdef +/MP {3 1 roll moveto 1 sub {rlineto} repeat} bdef +/AP {{rlineto} repeat} bdef +/PDlw -1 def +/W {/PDlw currentlinewidth def setlinewidth} def +/PP {closepath eofill} bdef +/DP {closepath stroke} bdef +/MR {4 -2 roll moveto dup 0 exch rlineto exch 0 rlineto + neg 0 exch rlineto closepath} bdef +/FR {MR stroke} bdef +/PR {MR fill} bdef +/L1i {{currentfile picstr readhexstring pop} image} bdef +/tMatrix matrix def +/MakeOval {newpath tMatrix currentmatrix pop translate scale +0 0 1 0 360 arc tMatrix setmatrix} bdef +/FO {MakeOval stroke} bdef +/PO {MakeOval fill} bdef +/PD {currentlinewidth 2 div 0 360 arc fill + PDlw -1 eq not {PDlw w /PDlw -1 def} if} def +/FA {newpath tMatrix currentmatrix pop translate scale + 0 0 1 5 -2 roll arc tMatrix setmatrix stroke} bdef +/PA {newpath tMatrix currentmatrix pop translate 0 0 moveto scale + 0 0 1 5 -2 roll arc closepath tMatrix setmatrix fill} bdef +/FAn {newpath tMatrix currentmatrix pop translate scale + 0 0 1 5 -2 roll arcn tMatrix setmatrix stroke} bdef +/PAn {newpath tMatrix currentmatrix pop translate 0 0 moveto scale + 0 0 1 5 -2 roll arcn closepath tMatrix setmatrix fill} bdef +/vradius 0 def /hradius 0 def /lry 0 def +/lrx 0 def /uly 0 def /ulx 0 def /rad 0 def +/MRR {/vradius xdef /hradius xdef /lry xdef /lrx xdef /uly xdef + /ulx xdef newpath tMatrix currentmatrix pop ulx hradius add uly + vradius add translate hradius vradius scale 0 0 1 180 270 arc + tMatrix setmatrix lrx hradius sub uly 
vradius add translate + hradius vradius scale 0 0 1 270 360 arc tMatrix setmatrix + lrx hradius sub lry vradius sub translate hradius vradius scale + 0 0 1 0 90 arc tMatrix setmatrix ulx hradius add lry vradius sub + translate hradius vradius scale 0 0 1 90 180 arc tMatrix setmatrix + closepath} bdef +/FRR {MRR stroke } bdef +/PRR {MRR fill } bdef +/MlrRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lry uly sub 2 div def + newpath tMatrix currentmatrix pop ulx rad add uly rad add translate + rad rad scale 0 0 1 90 270 arc tMatrix setmatrix lrx rad sub lry rad + sub translate rad rad scale 0 0 1 270 90 arc tMatrix setmatrix + closepath} bdef +/FlrRR {MlrRR stroke } bdef +/PlrRR {MlrRR fill } bdef +/MtbRR {/lry xdef /lrx xdef /uly xdef /ulx xdef /rad lrx ulx sub 2 div def + newpath tMatrix currentmatrix pop ulx rad add uly rad add translate + rad rad scale 0 0 1 180 360 arc tMatrix setmatrix lrx rad sub lry rad + sub translate rad rad scale 0 0 1 0 180 arc tMatrix setmatrix + closepath} bdef +/FtbRR {MtbRR stroke } bdef +/PtbRR {MtbRR fill } bdef +/stri 6 array def /dtri 6 array def +/smat 6 array def /dmat 6 array def +/tmat1 6 array def /tmat2 6 array def /dif 3 array def +/asub {/ind2 exch def /ind1 exch def dup dup + ind1 get exch ind2 get sub exch } bdef +/tri_to_matrix { + 2 0 asub 3 1 asub 4 0 asub 5 1 asub + dup 0 get exch 1 get 7 -1 roll astore } bdef +/compute_transform { + dmat dtri tri_to_matrix tmat1 invertmatrix + smat stri tri_to_matrix tmat2 concatmatrix } bdef +/ds {stri astore pop} bdef +/dt {dtri astore pop} bdef +/db {2 copy /cols xdef /rows xdef mul dup 3 mul string + currentfile + 3 index 0 eq {/ASCIIHexDecode filter} + {/ASCII85Decode filter 3 index 2 eq {/RunLengthDecode filter} if } + ifelse exch readstring pop + dup 0 3 index getinterval /rbmap xdef + dup 2 index dup getinterval /gbmap xdef + 1 index dup 2 mul exch getinterval /bbmap xdef pop pop}bdef +/it {gs np dtri aload pop moveto lineto lineto cp c + cols rows 8 compute_transform + 
rbmap gbmap bbmap true 3 colorimage gr}bdef +/il {newpath moveto lineto stroke}bdef +currentdict end def +%%EndProlog + +%%BeginSetup +MathWorks begin + +0 cap + +end +%%EndSetup + +%%Page: 1 1 +%%BeginPageSetup +%%PageBoundingBox: 30 16 493 297 +MathWorks begin +bpage +%%EndPageSetup + +%%BeginObject: obj1 +bplot + +/dpi2point 12 def +portraitMode 0192 3636 csm + + 174 65 5557 3375 rc +95 dict begin %Colortable dictionary +/c0 { 0.000000 0.000000 0.000000 sr} bdef +/c1 { 1.000000 1.000000 1.000000 sr} bdef +/c2 { 0.900000 0.000000 0.000000 sr} bdef +/c3 { 0.000000 0.820000 0.000000 sr} bdef +/c4 { 0.000000 0.000000 0.800000 sr} bdef +/c5 { 0.910000 0.820000 0.320000 sr} bdef +/c6 { 1.000000 0.260000 0.820000 sr} bdef +/c7 { 0.000000 0.820000 0.820000 sr} bdef +c0 +1 j +1 sg + 0 0 6269 3445 rf +6 w +0 2756 5014 0 0 -2756 627 3100 4 MP +PP +-5014 0 0 2756 5014 0 0 -2756 627 3100 5 MP stroke +4 w +DO +SO +6 w +0 sg + 627 3100 mt 5641 3100 L + 627 3100 mt 627 344 L +1343 3100 mt 1343 3049 L +%%IncludeResource: font Helvetica +/Helvetica /ISOLatin1Encoding 120 FMSR + +1277 3245 mt +(10) s +2139 3100 mt 2139 3049 L +2073 3245 mt +(20) s +2935 3100 mt 2935 3049 L +2869 3245 mt +(30) s +3730 3100 mt 3730 3049 L +3664 3245 mt +(40) s +4526 3100 mt 4526 3049 L +4460 3245 mt +(50) s +5322 3100 mt 5322 3049 L +5256 3245 mt +(60) s + 627 3100 mt 677 3100 L + 526 3144 mt +(0) s + 627 2488 mt 677 2488 L + 459 2532 mt +(50) s + 627 1877 mt 677 1877 L + 392 1921 mt +(100) s + 627 1266 mt 677 1266 L + 392 1310 mt +(150) s + 627 655 mt 677 655 L + 392 699 mt +(200) s +gs 627 344 5015 2757 rc +/c8 { 1.000000 0.133333 0.000000 sr} bdef +c8 +0 2503 80 3 80 -42 79 -65 80 -40 79 -28 80 -27 80 -20 +79 -60 80 43 79 -40 80 -54 80 -21 79 -54 80 16 79 -41 +80 -20 79 -49 80 -12 80 -47 79 2 80 -39 79 -10 80 -33 +80 -7 79 -10 80 -45 79 -7 80 -22 80 -39 79 -7 80 -11 +79 22 80 -60 79 1 80 -21 80 2 79 -14 80 -3 79 23 +80 -17 80 -6 79 -11 80 -1 79 -11 80 -1 80 -4 79 4 +80 4 79 -10 80 -9 79 1 80 -2 
80 5 79 -5 80 -3 +79 -6 80 -2 80 3 79 -6 80 -1 79 -6 80 -2 79 -9 +0 -1572 627 3100 66 MP +PP +/c9 { 0.800000 0.106667 0.000000 sr} bdef +c9 +-5014 0 0 2503 80 3 80 -42 79 -65 80 -40 79 -28 80 -27 +80 -20 79 -60 80 43 79 -40 80 -54 80 -21 79 -54 80 16 +79 -41 80 -20 79 -49 80 -12 80 -47 79 2 80 -39 79 -10 +80 -33 80 -7 79 -10 80 -45 79 -7 80 -22 80 -39 79 -7 +80 -11 79 22 80 -60 79 1 80 -21 80 2 79 -14 80 -3 +79 23 80 -17 80 -6 79 -11 80 -1 79 -11 80 -1 80 -4 +79 4 80 4 79 -10 80 -9 79 1 80 -2 80 5 79 -5 +80 -3 79 -6 80 -2 80 3 79 -6 80 -1 79 -6 80 -2 +79 -9 0 -1572 627 3100 67 MP stroke +/c10 { 0.509804 1.000000 0.000000 sr} bdef +c10 +0 2499 80 2 80 -41 79 -65 80 -40 79 -27 80 -26 80 -21 +79 -61 80 43 79 -40 80 -52 80 -21 79 -54 80 15 79 -40 +80 -21 79 -48 80 -11 80 -48 79 1 80 -37 79 -10 80 -34 +80 -6 79 -10 80 -45 79 -8 80 -21 80 -39 79 -8 80 -10 +79 21 80 -59 79 0 80 -20 80 2 79 -14 80 -3 79 23 +80 -17 80 -6 79 -10 80 -2 79 -11 80 -1 80 -4 79 4 +80 4 79 -9 80 -9 79 0 80 -2 80 6 79 -6 80 -2 +79 -6 80 -3 80 3 79 -6 80 0 79 -6 80 -3 79 -9 +0 -1571 627 3100 66 MP +PP +/c11 { 0.407843 0.800000 0.000000 sr} bdef +c11 +-5014 0 0 2499 80 2 80 -41 79 -65 80 -40 79 -27 80 -26 +80 -21 79 -61 80 43 79 -40 80 -52 80 -21 79 -54 80 15 +79 -40 80 -21 79 -48 80 -11 80 -48 79 1 80 -37 79 -10 +80 -34 80 -6 79 -10 80 -45 79 -8 80 -21 80 -39 79 -8 +80 -10 79 21 80 -59 79 0 80 -20 80 2 79 -14 80 -3 +79 23 80 -17 80 -6 79 -10 80 -2 79 -11 80 -1 80 -4 +79 4 80 4 79 -9 80 -9 79 0 80 -2 80 6 79 -6 +80 -2 79 -6 80 -3 80 3 79 -6 80 0 79 -6 80 -3 +79 -9 0 -1571 627 3100 67 MP stroke +/c12 { 0.000000 0.721569 1.000000 sr} bdef +c12 +0 2206 80 -4 80 -44 79 -49 80 -30 79 -6 80 -26 80 -21 +79 -39 80 19 79 -27 80 -49 80 -7 79 -41 80 2 79 -28 +80 -17 79 -26 80 -18 80 -27 79 -8 80 -24 79 -8 80 -19 +80 -22 79 -8 80 -23 79 -13 80 -21 80 -18 79 -17 80 -8 +79 -6 80 -19 79 -6 80 -6 80 -2 79 -5 80 1 79 1 +80 -6 80 -4 79 -2 80 -2 79 -2 80 -2 80 -2 79 0 +80 5 79 -8 80 -1 79 0 80 -2 80 2 79 -2 80 -1 +79 
-1 80 1 80 0 79 -3 80 0 79 0 80 -2 79 -1 +0 -1504 627 3100 66 MP +PP +/c13 { 0.000000 0.577255 0.800000 sr} bdef +c13 +-5014 0 0 2206 80 -4 80 -44 79 -49 80 -30 79 -6 80 -26 +80 -21 79 -39 80 19 79 -27 80 -49 80 -7 79 -41 80 2 +79 -28 80 -17 79 -26 80 -18 80 -27 79 -8 80 -24 79 -8 +80 -19 80 -22 79 -8 80 -23 79 -13 80 -21 80 -18 79 -17 +80 -8 79 -6 80 -19 79 -6 80 -6 80 -2 79 -5 80 1 +79 1 80 -6 80 -4 79 -2 80 -2 79 -2 80 -2 80 -2 +79 0 80 5 79 -8 80 -1 79 0 80 -2 80 2 79 -2 +80 -1 79 -1 80 1 80 0 79 -3 80 0 79 0 80 -2 +79 -1 0 -1504 627 3100 67 MP stroke +/c14 { 1.000000 0.929412 0.000000 sr} bdef +c14 +0 564 80 -25 80 -6 79 -1 80 -12 79 -5 80 -5 80 5 +79 -12 80 0 79 9 80 -22 80 -6 79 -9 80 3 79 -1 +80 -5 79 -6 80 -5 80 -4 79 -7 80 -3 79 -2 80 2 +80 -8 79 -1 80 -6 79 -3 80 -4 80 -3 79 -6 80 2 +79 -5 80 -4 79 -1 80 -3 80 0 79 -3 80 1 79 0 +80 -3 80 -1 79 0 80 -1 79 -1 80 0 80 -1 79 1 +80 0 79 -1 80 -1 79 0 80 -1 80 0 79 0 80 0 +79 -1 80 0 80 0 79 0 80 0 79 0 80 0 79 -1 +0 -392 627 3100 66 MP +PP +/c15 { 0.800000 0.743529 0.000000 sr} bdef +c15 +-5014 0 0 564 80 -25 80 -6 79 -1 80 -12 79 -5 80 -5 +80 5 79 -12 80 0 79 9 80 -22 80 -6 79 -9 80 3 +79 -1 80 -5 79 -6 80 -5 80 -4 79 -7 80 -3 79 -2 +80 2 80 -8 79 -1 80 -6 79 -3 80 -4 80 -3 79 -6 +80 2 79 -5 80 -4 79 -1 80 -3 80 0 79 -3 80 1 +79 0 80 -3 80 -1 79 0 80 -1 79 -1 80 0 80 -1 +79 1 80 0 79 -1 80 -1 79 0 80 -1 80 0 79 0 +80 0 79 -1 80 0 80 0 79 0 80 0 79 0 80 0 +79 -1 0 -392 627 3100 67 MP stroke +/c16 { 0.984314 0.000000 1.000000 sr} bdef +c16 +0 118 80 -16 80 -4 79 -2 80 -8 79 -1 80 -4 80 7 +79 -12 80 0 79 9 80 -21 80 -3 79 -7 80 3 79 -1 +80 -3 79 -5 80 -2 80 -4 79 -6 80 -2 79 -1 80 4 +80 -6 79 0 80 -2 79 -4 80 -1 80 -1 79 -4 80 4 +79 -5 80 -1 79 -1 80 -3 80 0 79 -3 80 1 79 -1 +80 -1 80 -1 79 -1 80 -1 79 -1 80 0 80 0 79 0 +80 -1 79 0 80 -1 79 0 80 0 80 -1 79 0 80 0 +79 0 80 0 80 0 79 0 80 0 79 0 80 0 79 0 +0 -4 627 3100 66 MP +PP +/c17 { 0.787451 0.000000 0.800000 sr} bdef +c17 +-5014 0 0 118 80 -16 80 -4 79 -2 
80 -8 79 -1 80 -4 +80 7 79 -12 80 0 79 9 80 -21 80 -3 79 -7 80 3 +79 -1 80 -3 79 -5 80 -2 80 -4 79 -6 80 -2 79 -1 +80 4 80 -6 79 0 80 -2 79 -4 80 -1 80 -1 79 -4 +80 4 79 -5 80 -1 79 -1 80 -3 80 0 79 -3 80 1 +79 -1 80 -1 80 -1 79 -1 80 -1 79 -1 80 0 80 0 +79 0 80 -1 79 0 80 -1 79 0 80 0 80 -1 79 0 +80 0 79 0 80 0 80 0 79 0 80 0 79 0 80 0 +79 0 0 -4 627 3100 67 MP stroke +gr + +c17 +0 sg +2906 3388 mt +(nr. cores) s + 310 1753 mt -90 rotate +(s) s +90 rotate +2720 249 mt +(Total task times) s + 610 3143 mt +( ) s +5625 386 mt +( ) s +1 sg +0 772 1341 0 0 -772 686 1176 4 MP +PP +-1341 0 0 772 1341 0 0 -772 686 1176 5 MP stroke +4 w +DO +SO +6 w +0 sg + 686 1176 mt 2027 1176 L + 686 404 mt 2027 404 L + 686 1176 mt 686 404 L +2027 1176 mt 2027 404 L + 686 1176 mt 2027 1176 L + 686 1176 mt 686 404 L + 686 1176 mt 2027 1176 L + 686 404 mt 2027 404 L + 686 1176 mt 686 404 L +2027 1176 mt 2027 404 L +1148 536 mt +(tCOM) s +gs 686 404 1342 773 rc +c8 +0 110 355 0 0 -110 757 548 4 MP +PP +c9 +-355 0 0 110 355 0 0 -110 757 548 5 MP stroke +gr + +c9 +0 sg +1148 684 mt +(tPAIR_PC) s +gs 686 404 1342 773 rc +c10 +0 111 355 0 0 -111 757 697 4 MP +PP +c11 +-355 0 0 111 355 0 0 -111 757 697 5 MP stroke +gr + +c11 +0 sg +1148 833 mt +(tPAIR_PP) s +gs 686 404 1342 773 rc +c12 +0 111 355 0 0 -111 757 845 4 MP +PP +c13 +-355 0 0 111 355 0 0 -111 757 845 5 MP stroke +gr + +c13 +0 sg +1148 981 mt +(tSELF) s +gs 686 404 1342 773 rc +c14 +0 111 355 0 0 -111 757 993 4 MP +PP +c15 +-355 0 0 111 355 0 0 -111 757 993 5 MP stroke +gr + +c15 +0 sg +1148 1129 mt +(qsched_gettask) s +gs 686 404 1342 773 rc +c16 +0 110 355 0 0 -110 757 1141 4 MP +PP +c17 +-355 0 0 110 355 0 0 -110 757 1141 5 MP stroke +gr + +c17 + +end %%Color Dict + +eplot +%%EndObject + +epage +end + +showpage + +%%Trailer +%%EOF diff --git a/paper/figures/BH_times.pdf b/paper/figures/BH_times.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1eaf65e17544c0109275e5e9c13cf0d73d4d3b60 Binary files /dev/null 
and b/paper/figures/BH_times.pdf differ diff --git a/paper/figures/Resources.pdf b/paper/figures/Resources.pdf index 6bbde38d70aeaee0f086196c7445779ec2fbc9e8..6c050fef780a82d2b8a4e99b3191dea9df41d149 100644 Binary files a/paper/figures/Resources.pdf and b/paper/figures/Resources.pdf differ diff --git a/paper/figures/Resources.svg b/paper/figures/Resources.svg index f4b97d34a89d9fa6d7a037ab6478d085bc8dfe11..71724d8f716f4719e3bffee6e9040bfdfe17bde9 100644 --- a/paper/figures/Resources.svg +++ b/paper/figures/Resources.svg @@ -78,7 +78,7 @@ style="fill:none;stroke:#000000;stroke-width:3.11821023;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" transform="matrix(1,0,-0.76604228,0.64279018,0,0)" /> <rect - style="fill:none;stroke:#000000;stroke-width:2.49456818;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + style="fill:#ff2200;stroke:#000000;stroke-width:2.49456817999999991;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1" id="rect4977" width="80" height="37.337223" @@ -110,7 +110,7 @@ y="830.84991" transform="matrix(1,0,-0.76604228,0.64279018,0,0)" /> <rect - style="fill:#ff0000;fill-opacity:1;stroke:#000000;stroke-width:2.5;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + style="fill:#ff2200;fill-opacity:1;stroke:#000000;stroke-width:2.50000000000000000;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" id="rect5505" width="30" height="30" @@ -186,7 +186,7 @@ y="936.63855" /> <rect transform="matrix(1,0,-0.76604228,0.64279018,0,0)" - style="fill:none;stroke:#000000;stroke-width:2.49456818;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + 
style="fill:#ff2200;stroke:#000000;stroke-width:2.49456817999999991;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1" id="rect5711" width="40" height="18.668612" @@ -199,7 +199,7 @@ height="18.668612" width="40" id="rect5713" - style="fill:none;stroke:#000000;stroke-width:2.49456818;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> + style="fill:#ff2200;stroke:#000000;stroke-width:2.49456817999999991;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0;fill-opacity:1" /> <rect y="592.36218" x="400" @@ -208,7 +208,7 @@ id="rect5715" style="fill:none;stroke:#000000;stroke-width:2.5;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> <rect - style="fill:#ff0000;fill-opacity:1;stroke:#000000;stroke-width:2.5;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" + style="fill:#ff2200;fill-opacity:1;stroke:#000000;stroke-width:2.50000000000000000;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" id="rect5717" width="30" height="30" @@ -220,7 +220,7 @@ height="30" width="30" id="rect5719" - style="fill:#ff0000;fill-opacity:1;stroke:#000000;stroke-width:2.5;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> + style="fill:#ff2200;fill-opacity:1;stroke:#000000;stroke-width:2.50000000000000000;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" /> <rect style="fill:none;stroke:#000000;stroke-width:2.5;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" id="rect5721" diff --git a/paper/paper.tex b/paper/paper.tex index f4d5fdf999d0523c98a56361c44e016a6b51bae4..f079c4ccb4a52410a11efa3809983491654312d7 100644 
--- a/paper/paper.tex +++ b/paper/paper.tex @@ -85,7 +85,16 @@ % currently affiliated with NASA. \begin{abstract} -Bla. +This paper describes QuickSched, a compact and efficient Open-Source +C-language library for task-based shared-memory parallel programming. +QuickSched extends the standard dependency-only scheme of task-based +programming with the concept of task conflicts, i.e.~sets of tasks +that can be executed in any order, yet not concurrently. +These conflicts are modelled using exclusively lockable +hierarchical resources. +The scheduler is shown to perform and scale well on a 64-core parallel +shared-memory machine for two example problems: A tiled QR +decomposition and a task-based Barnes-Hut tree code. \end{abstract} \category{???}{Computing methodologies}{Shared memory algorithms \and Concurrent algorithms} @@ -108,7 +117,7 @@ QuickSched: Task-based parallelism with dependencies and conflicts.} % The remaining information (journal title, volume, article number, date, etc.) is 'auto-generated'. \begin{bottomstuff} -This work was supported by a Durham Universiy Seedcorn Grant. +This work was supported by a Durham University Seedcorn Grant. Author's address: P. Gonnet, School of Engineering and Computing Sciences, Durham University, South Road, Durham, DH1 3LE, United Kingdom. @@ -120,7 +129,7 @@ Durham University, South Road, Durham, DH1 3LE, United Kingdom. \section{Introduction} Task-based parallelism is a conceptually simple paradigm for -shared-memory paralelism in which a computation is broken-down +shared-memory parallelism in which a computation is broken-down into a set of inter-dependent tasks which are executed concurrently. Task dependencies are used to model the flow of data between @@ -132,15 +141,14 @@ respectively of a Directed Acyclic Graph (DAG) which can be traversed in topological order, executing the tasks at the nodes on the way down. -This computational model is trival to parallelize. 
+This computational model is trivial to parallelize. Given a set of inter-dependent tasks and a set of computational threads, each thread repeatedly selects a task with no unsatisfied dependencies from the DAG and executes it. If no tasks are available, the thread waits until any other thread finishes executing a task, thus potentially releasing new tasks, or until all tasks in the DAG have been executed. - -\fig{Tasks} shows the DAG for a set of tasks. +\fig{Tasks} shows such a DAG for a set of tasks. The arrows indicate the direction of the dependency, i.e.~an arrow from task $A$ to task $B$ indicates that task $B$ depends on task $A$. @@ -155,9 +163,11 @@ and can be executed by any other computational thread. The arrows indicate the direction of the dependency, i.e.~an arrow from task $A$ to task $B$ indicates that task $B$ depends on task $A$. - Tasks $A$, $G$, and $J$ have no unsatisifed dependencies and + Tasks $A$, $G$, and $J$ have no unsatisfied dependencies and can therefore be executed. - Once task $G$ has completed, tasks $F$ and $H$ become available.} + Once task $G$ has completed, tasks $F$ and $H$ become available, + and task $E$ only becomes available once both tasks $D$ and $F$ + have completed.} \label{fig:Tasks} \end{figure} @@ -172,11 +182,16 @@ Although simple to use, this implicit dependency management limits the types of DAGs that can be represented, i.e.~for the example in \fig{Tasks}, using such a spawning model would create implicit dependencies between the lowest-level -tasks $C$, $E$, and $K$. +tasks $C$, $E$, and $K$.\footnote{ +The main thread spawns tasks $A$, $G$ and $J$, $A$ spawns $B$ and $D$, +$G$ spawns $F$, $H$, and then $I$, $B$ spawns $C$. 
+The main thread then has to {\tt sync} for $A$, $G$, and $J$, +and thus implicitly all their spawned tasks, before executing +$E$ and $K$.} In SMP superscalar \cite{ref:Perez2008}, StarPU \cite{ref:Augonnet2011}, QUARK \cite{ref:Yarkhan2011}, and KAAPI \cite{ref:Gautier2007} -the programmer spcifies +the programmer specifies what shared data each task will access, and how that data will be accessed, e.g.~read, write, or read-write access. The dependencies between tasks are then generated @@ -185,7 +200,7 @@ data must be accessed and updated in the order in which the tasks are generated. StarPU also provides an interface for specifying additional dependencies explicitly. -Intel's Threding Building Blocks (TBB) +Intel's Threading Building Blocks (TBB) \cite{ref:Reinders2010} provide task-based parallelism using C++ templates. Dependencies are handled either by explicitly waiting @@ -196,7 +211,8 @@ Finally, the very popular OpenMP standard provides some basic support for spawning tasks, similar to Cilk, as of version 3.0 \cite{ref:OpenMP2008}. OmpSs \cite{ref:Duran2011} extends this scheme with automatic -dependency generation as in SMP superscalar, along with +dependency generation as in SMP superscalar, of which it +is a direct descendant, along with the ability to explicitly wait on certain tasks. In all of these systems, the tasks are only aware of a single @@ -208,24 +224,25 @@ Consider the case of two tasks that update some shared resource in an order-independent way, e.g. when accumulating a result in a shared variable, or exclusively writing to an output file. In order to avoid concurrent access to that resource, it is -imperative that the execution of both tasks does not overlap, -yet the order in which the tasks are exectued does not matter. -In the following, such a relationship will be refered to +imperative that the execution of both tasks do not overlap, +yet the order in which the tasks are executed is irrelevant. 
+In the following, such a relationship will be referred to as a ``conflict'' between two tasks. \fig{TaskConflicts} shows a task graph with conflicting tasks joined by thick dashed lines. -None of tasks $F$, $H$, and $I$ cannot be executed concurrently, +None of tasks $F$, $H$, and $I$ can be executed concurrently, i.e. they must be serialized, yet in no particular order. In dependency-only systems, such conflicts can be modelled with dependencies, which enforce a pre-determined arbitrary ordering on conflicting tasks. -This, however, imposes unnecessary restriction on the order -in which tasks can be scheduled, especially in the presence +This unnecessary restriction on the order +in which tasks can be scheduled can severely limit the +parallelizability of a computation, especially in the presence of multiple conflicts per task. Both \citeN{ref:Ltaief2012} and \citeN{ref:Agullo2013} note this problem in their respective implementations of the Fast Multipole -Method, in which forces computed in different tasks are +Method (FMM), in which forces computed in different tasks are accumulated on a set of particles. Conflicts can be modeled directly as exclusive locks on a shared resource @@ -237,11 +254,9 @@ While task $F$ is being executed, neither $H$ nor $I$ can lock the same resource, and therefore will not execute until task $F$ is done and the lock has been released. -This paper presents QuickSched, a framework for task-based -parallel programming with constraints. 
In order to be of practical use, and to scale well with modern -multi-core shared-memory architectures, the task scheduler -must be: +multi-core shared-memory architectures, any task scheduler +implementing constraints must be: \begin{itemize} \item {\em Correct}: All constraints, i.e.~dependencies and conflicts, must be correctly enforced, @@ -251,7 +266,7 @@ must be: sets of data should be preferentially executed on the same core to preserve memory/cache locality as far as possible, and \item {\em Parallel-efficient}: Tasks should be executed in an order - that sufficient work is available for all computatoinal + that sufficient work is available for all computational threads at all times. \end{itemize} \noindent This paper presents QuickSched, a framework for task-based @@ -263,13 +278,15 @@ Section~4 presents two test-cases: \begin{enumerate} \item The tiled QR decomposition described in \cite{ref:Buttari2009} and for - which the QUARK scheduler was originally developed, + which the QUARK scheduler was originally developed, and \item A task-based Barnes-Hut tree-code to compute the - gravitational N-body problem, + gravitational N-body problem similar to the FMM codes + of \citeN{ref:Ltaief2012} and \citeN{ref:Agullo2013}, \end{enumerate} These real-world examples show how QuickSched can be used in practice, and can be used to assess its efficiency. -Section~5 concludes with some general observations and future work. +Section~5 concludes with some general observations and future work +directions. \begin{figure} \centerline{\epsfig{file=figures/TaskConflicts.pdf,width=0.5\textwidth}} @@ -286,7 +303,7 @@ Section~5 concludes with some general observations and future work. \section{Data Structures and Algorithms} -The QuickSched task scheduler consits of four main +The QuickSched task scheduler consist of four main objects types: {\em task}, {\em resource}, {\em scheduler}, and {\em queue}. @@ -311,7 +328,7 @@ where and when, respectively. 
\label{fig:QSched} \end{figure} -The division of labour regarding {\em correctness} +The division of labor regarding {\em correctness} between the scheduler and the queue objects is illustrated in \fig{QSched}. The scheduler holds the tasks and is in charge @@ -386,7 +403,6 @@ if task $B$ depends on task $A$, then task $A$ {\em unlocks} task $B$. The unlocks therefore follow the direction of the arrows in \figs{Tasks}{TaskConflicts}. - Conversely, {\tt wait} is the number of unresolved dependencies associated with this task, i.e.~the number of unexecuted tasks that unlock this task. @@ -403,7 +419,7 @@ for ( k = 0 ; k < N ; k++ ) \end{lstlisting} \end{minipage}\end{center} -The {\tt locks} field points to the first element of +The {\tt locks} field of each task points to the first element of an array of {\tt nr\_locks} pointers to {\em resources} for which exclusive locks must be obtained for the task to execute. @@ -435,8 +451,8 @@ if ( k < N ) \end{lstlisting} \end{minipage}\end{center} \noindent where the array {\tt top} contains the task indices -in reverse topological order. -The test in line~10 is a convenient check if the tasks and teir +in topological order. +The test in line~10 is a convenient check if the tasks and their dependencies actually do form an acyclic graph. The weights themselves are then computed as follows \begin{center}\begin{minipage}{0.9\textwidth} @@ -452,13 +468,13 @@ for ( k = N-1 ; k >= 0 ; k-- ) { \end{minipage}\end{center} \noindent where the tasks are traversed in reverse topological order, computing the recursive weight as the sum of the -task cost and the maximum weight of the tasks it unlocks, -and recomputing the task waits at the same time. +task cost and the maximum weight of the tasks it unlocks (line~6), +and recomputing the task waits at the same time (line~3). 
\subsection{Resources} -The data structure for the resources is as follows: +Resources consist of the following data structure: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} struct resource { @@ -475,7 +491,7 @@ that are themselves subsets of larger resources. This can be useful, e.g.~in the context of particle simulations described in the next section, where particles are sorted into hierarchical cells which are used at different levels. -The owner field is the ID of the queue to which this +The {\tt owner} field is the ID of the queue to which this resource has been preferentially assigned. The {\tt lock} field is either {\tt 0} or {\tt 1} and indicates @@ -483,7 +499,7 @@ whether this resource is currently in use, i.e.~{\em locked}. To avoid race conditions, this value should only be tested and set using atomic instructions. The {\tt hold} field is a counter indicating how many -sub-resources of the current resouce are locked. +sub-resources of the current resource are locked. If a resource's hold counter is not zero, then it is {\em held} and cannot be locked. Likewise, if a resource is locked, it cannot be held @@ -491,11 +507,11 @@ Likewise, if a resource is locked, it cannot be held \begin{figure} \centerline{\epsfig{file=figures/Resources.pdf,width=0.6\textwidth}} - \caption{A hierarchicy of cells (left) and the hierarchy of + \caption{A hierarchy of cells (left) and the hierarchy of corresponding hierarchical resources at each level. Each square on the right represents a single resource, and arrows indicate the resource's parent. 
- Resources coloured red are locked, resources coloured orange + Resources colored red are locked, resources colored orange are held, where the number in the square indicates the value of the hold counter.} \label{fig:Resources} @@ -577,7 +593,8 @@ void resource_unlock ( struct resource *r ) { \end{lstlisting} \end{minipage}\end{center} \noindent where the resource itself is unlocked (line~3) -and the hold counter of its parents is decremented (lines~4--5). +and the hold counters of all of its hierarchical parents +are decremented (lines~4--5). \subsection{Queues} @@ -587,28 +604,29 @@ to find the task with maximum weight whose resources can all be locked, and to do so as efficiently as possible. One possible strategy would be to maintain an array of tasks -sorted by their weights, and to trverse that list in descending +sorted by their weights, and to traverse that list in descending order, trying to lock the resources of each task, until a lockable task is found, or returning a failure otherwise. Although this would return the best possible task, it requires maintaining a sorted list in which inserting or removing an entry is in \oh{n} for $n$ elements. - -Using an unsorted array requires only \oh{1} operations for -insertion and deletion, but is undesireable as it completely +Using an unsorted array would require only \oh{1} operations for +insertion and deletion, but is undesirable as it completely ignores the task weights. As a compromise, the queue stores the tasks in an array -organized as a max-heap, with the task with maximum weight +organized as a max-heap, i.e.~where the $k$th entry is ``larger'' +than both the $2k+1$st and the $2k+2$nd entry, +with the task with maximum weight in the first position. -Maintainig this heap structure thus requires \oh{\log n} +Maintaining this heap structure thus requires \oh{\log n} operations for both insertion and deletion, i.e. for the bubble-up and trickle-down operations respectively. 
The array of tasks is then traversed as if it were sorted, returning the first task that can be locked. Although the first task in the array will be the task with -maximum weight, the following tasks are only losely ordered, +maximum weight, the following tasks are only loosely ordered, where the $k$th of $n$ tasks has a larger weight than at least $\lfloor n/k\rfloor -1$ other tasks. @@ -640,7 +658,7 @@ void queue_put ( struct queue *q , struct task *t ) { lock on the queue can be obtained. The task is added to the end of the heap array (line~3) and the heap order is fixed (line~4). -Before exiting, the lock on the queue is released (line~5). +Before returning, the lock on the queue is released (line~5). Obtaining a task from the queue can be implemented as follows: \begin{center}\begin{minipage}{0.9\textwidth} @@ -651,7 +669,7 @@ struct task *queue_get ( struct queue *q ) { while ( atomic_cas( q->lock , 0 , 1 ) != 0 ); for ( k = 0 ; k < q->count ; k++ ) { for ( j = 0 ; j < q->tasks[k]->nr_locks ; j++ ) - if ( !resource_lock( q->tasks[k]->lock[j] ) + if ( !resource_lock( q->tasks[k]->lock[j] ) ) break; if ( j < q->tasks[k]->nr_locks ) for ( j = j-1 ; j >= 0 ; j-- ) @@ -676,7 +694,7 @@ locking the resources of each task (lines~6--8). If any of these locks fail (line~9), the locks that were obtained are released (lines~10--11), otherwise, the traversal is aborted (line~13). -If all the locks on a task could be obtained (line~14), the +If all the locks on a task could be obtained (line~15), the task pointer is replaced by the last pointer in the heap (line~17) and the heap order is restored (line~18). Finally, the queue lock is released (line~19) and the locked task @@ -687,7 +705,7 @@ or, if no lockable task could be found, {\tt NULL} is returned. 
The scheduler object is used as the main interface to the QuickSched task scheduler, and as such contains the instances -other three object types: +of the other three object types: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} struct qsched { @@ -699,12 +717,12 @@ struct qsched { }; \end{lstlisting} \end{minipage}\end{center} -\noindent where\dots the only additional field {\tt waiting} is +\noindent where the only additional field {\tt waiting} is used to keep track of the number of tasks that have not been executed. Note that for brevity, and to avoid conflicts with the naming schemes of other standard libraries, the type name {\tt qsched} -is used. +is used for the scheduler data type. The tasks are executed as follows: \begin{center}\begin{minipage}{0.9\textwidth} @@ -727,17 +745,17 @@ void qsched_run ( qsched *s , void (*fun)( int , void * ) ) { fills the queues (line~1). For simplicity, OpenMP \cite{ref:Dagum1998}, which is available for most compilers, is used to create a parallel section -in which the code between lines~4 and~12 is executed +in which the code between lines~4 and~11 is executed concurrently. The parallel section consists of a loop (lines~7--10) in -which a task is acquired via the {\em execution function} -{\tt qsched\_gettask} -and its type and data are passed to a user-supplied execution -function. +which a task is acquired via {\tt qsched\_gettask} +and its type and data are passed to a user-supplied +{\em execution function} {\tt fun}. Once the task has been executed, it is returned to the -scheduler via the function {\tt qsched\_done}. +scheduler via the function {\tt qsched\_done}, i.e.~to +unlock its resources and unlock dependent tasks. The loop terminates when the scheduler runs out of tasks, -i.e.~when {\tt qstack\_gettask} returns {\tt NULL}, and +i.e.~when {\tt qsched\_gettask} returns {\tt NULL}, and the function exits once all the threads have exited their loops. 
@@ -758,7 +776,6 @@ void qsched_start ( qsched *s ) { \noindent where line~2 sets the {\tt unlocks}, {\tt locks}, and {\tt uses} pointers in the tasks. The operations in line~3 are described in \sect{tasks}. - The function {\tt qsched\_enqueue} tries to identify the best queue for a given task by looking at which queues last used the resources used and locked by the task, e.g.: @@ -781,8 +798,8 @@ void qsched_enqueue ( qsched *s , struct task *t ) { \end{minipage}\end{center} \noindent where the array {\tt score} keeps a count of the task resources ``owned'', or last used, by each queue. -In lines~9--12 the queue with the highest such score is -chosen on which the task is then put (line~13). +In lines~9--11 the queue with the highest such score is +chosen on which the task is then put (line~12). The function {\tt qsched\_gettask} fetches a task from one of the queues: @@ -811,7 +828,7 @@ struct task *qsched_gettask ( qsched *s , int qid ) { \end{lstlisting} \end{minipage}\end{center} \noindent where the parameter {\tt qid} is the index of the -prefered queue. +preferred queue. If the queue is empty, or all of the tasks in that queue had unresolved conflicts, the scheduler uses {\em work stealing} \cite{ref:Blumofe1999}, i.e.~it loops over all other queues @@ -820,17 +837,19 @@ in a random order (line~6) and tries to get a task from them If a task could be obtained from any queue and task re-owning is switched on (line~13), the resources it locks and uses are marked as now being owned -by the prefered queue (lines~14--17). +by the preferred queue (lines~14--17). Finally, the task, or {\tt NULL} if no task could be obtained, is returned. -The final step in a task's lifecycle is, on completion, -to unlock the tasks which depend on it. +The final step in a task's life cycle is, on completion, +to unlock the resources and tasks which depend on it. 
This is handled by the function {\tt qsched\_done}: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} void qsched_done ( qsched *s , struct task *t ) { int k; + for ( k = 0 ; k < t->nr_locks ; k++ ) + resource_unlock( t->locks[k] ); for ( k = 0 ; k < t->nr_unlocks ; k++ ) if ( atomic_dec( &t->unlocks[k]->wait ) == 1 ) qsched_enqueue( s , t->unlocks[k] ); @@ -839,11 +858,11 @@ void qsched_done ( qsched *s , struct task *t ) { \end{lstlisting} \end{minipage}\end{center} \noindent If any of the unlocked tasks' wait counters -goes to zero (line~4), then the unlocked task is ready to +goes to zero (line~6), then the unlocked task is ready to run and is immediately dispatched via {\tt qsched\_enqueue}, as described earlier. Once all the dependent tasks have been unlocked, the -{\tt waiting} counter is decremented (line~6). +{\tt waiting} counter is decremented (line~8). \section{User Interface} @@ -852,12 +871,12 @@ The algorithms, as described in the previous section, have all been implemented as part of the Open-Source C-language QuickSched library.\footnote{\url{http://sourceforge.net/projects/quicksched/}} This section describes the interface functions and how they -are called, whereas the following section contains +are called, and the following section contains examples of how QuickSched can be used. As mentioned previously, the {\tt qsched} object is the main interface to the task scheduler. -As such, it provieds functionality for task and resource +As such, it provides functionality for task and resource creation, for assigning resources to tasks, either as locks or uses, and for assigning dependencies between tasks. The tasks and resources themselves are opaque to the @@ -902,7 +921,7 @@ The main functions for setting up the scheduler are: its handle. The owner field is the initial queue ID to which this resource should be assigned, or {\tt qsched\_owner\_none}. 
- The {\tt parent} field is the handle of the heirarchical parent of + The {\tt parent} field is the handle of the hierarchical parent of the new resource or {\tt qsched\_res\_none} if the resource has no hierarchical parent. \vspace{1mm} @@ -916,7 +935,7 @@ The main functions for setting up the scheduler are: \item {\tt void qsched\_adduse( struct qsched *s , qsched\_task\_t t , qsched\_res\_t res )} \\ Similar to {\tt qsched\_addlock}, yet the resource is only used and is not part of a conflict. - This information is used when assinging tasks to specific queues. + This information is used when assigning tasks to specific queues. \vspace{1mm} \item {\tt void qsched\_addunlock( struct qsched *s , qsched\_task\_t ta , qsched\_task\_t tb )} \\ Appends the task {\tt tb} to the list of tasks that the task {\tt ta} @@ -932,12 +951,12 @@ The main functions for setting up the scheduler are: \vspace{1mm} \end{itemize} -The library can be compiled to use either OpenMP or the +The library can be compiled to use OpenMP and/or the {\tt pthreads} library \cite{ref:Pthreads1995}. OpenMP is the default, but calling {\tt qsched\_init} with either the {\tt qsched\_flag\_yield} -or the {\tt qsched\_flag\_pthread} switches to using {\tt pthreads} -for the parallel loop. +or the {\tt qsched\_flag\_pthread} switches to using {\tt pthreads}, +if available, for the parallel loop. OpenMP has the advantage of being available for most compilers and also potentially providing some extra platform-specific @@ -949,7 +968,7 @@ any mechanism for yielding a thread if no tasks are available, i.e. the main loop in {\tt qsched\_gettask}, described in the previous section, will spin until a task becomes available. This may be a problem if other parts of the user application -are running in the background simultaneously. +are running concurrently in the background. 
Calling {\tt qsched\_init} with the {\tt qsched\_flag\_yield} forces the use of {\tt pthreads} and uses conditional variables to wait for a new task to be enqueued if obtaining a task @@ -960,16 +979,16 @@ processes. \section{Validation} -This section presents two test cases showing both +This section presents two test cases showing how QuickSched can be used in real-world applications, and providing benchmarks to assess its efficiency and scalability. The first test is the tiled QR decomposition originally -from \citeN{ref:Buttari2009}, which has been used as a benchmark +described in \citeN{ref:Buttari2009}, which has been used as a benchmark by other authors \cite{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}. -This example only requires dependencies and is presented -only as a point of comparison to existing task-based -parallel infrastructures. +This example only requires dependencies and is presented +as a point of comparison to existing task-based parallel +programming infrastructures. The second example is a Barnes-Hut tree-code, a problem similar to the Fast Multipole Method described in both @@ -980,12 +999,16 @@ via hierarchical resources, can be useful. The source code of both examples is distributed with the QuickSched library, along with scripts to run the benchmarks and generate the plots used in the following. +All examples were compiled with gcc v.\,4.8.1 using the +{\tt -O2 -march=native} flags, and all +tests were run on a 64-core AMD Opteron 6376 machine running +at 2.6\,GHz. \subsection{Task-Based QR Decomposition} \citeN{ref:Buttari2009} introduced the concept of using task-based -parallelsim for tile-based algorithms in numerical linear algebra, +parallelism for tile-based algorithms in numerical linear algebra, presenting parallel codes for the Cholesky, LU, and QR decompositions. These algorithms are now part of the PLASMA and MAGMA @@ -998,8 +1021,8 @@ task scheduler. 
\centerline{\epsfig{file=figures/QR.pdf,width=0.8\textwidth}} \caption{Task-based QR decomposition of a matrix consisting of $4\times 4$ tiles. - Each circle represents a tile, and its colour represents - the type of taks on that tile at that level. + Each circle represents a tile, and its color represents + the type of task on that tile at that level. Empty circles have no task associated with them. The arrows represent dependencies at each level, and tasks at each level also implicitly depend on the @@ -1018,7 +1041,7 @@ and $k$ is its level: \begin{center} \begin{tabular}{llll} - Task & cond. & depends on task & locks tile \\ + Task & where & depends on task(s) & locks tile(s) \\ \hline \epsfig{file=figures/TaskRed.pdf,height=9pt} DGEQRF & $i=j=k$ & $(i,j,k-1)$ & $(i,j)$ \\ \epsfig{file=figures/TaskGreen.pdf,height=9pt} DLARFT & $i=k$, $j>k$ & $(i,j,k-1)$, $(k,k,k)$ & $(i,j)$ \\ @@ -1036,9 +1059,6 @@ $(i,j,k-1)$ for $k>1$. Each task also modifies its own tile $(i,j)$, and the DTSQRF task additionally modifies the lower triangular part of the $(j,j)$th tile. -Setting up the dependencies and locks for a matrix of -$m\times n$ tiles is implemented as shown in \fig{CodeQR}, - \begin{figure} \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} @@ -1110,11 +1130,11 @@ where the $m\times n$ matrix {\tt tid} stores the handles of the last task at position $(i,j)$ and is initialized with empty tasks (line~7). Similarly, {\tt rid} stores the handles of the resources for each -tile of the matrix, allocated in line~8. +tile of the matrix, which are allocated in line~8. The following loops mirror the task generation described in Algorithm~2 of \citeN{ref:Buttari2009}. -E.g.~for each level {\tt k} (line~10), a DGEQRF task is created +For each level {\tt k} (line~10), a DGEQRF task is created for tile $(k,k)$ (lines~13--14). A lock is added for the newly created task on the resource associated with the $(k,k)$th tile (line~15). 
@@ -1125,7 +1145,7 @@ the new (line~17), and the new task is stored in {\tt tid} The remaining tasks are generated in the same way, with their respective locks and dependencies. -The execution function for these tasks simply calls the apropriate +The execution function for these tasks simply calls the appropriate kernels on the matrix tiles given by the task data: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} @@ -1152,7 +1172,7 @@ void exec_fun ( int type , void *data ) { \end{lstlisting} \end{minipage}\end{center} \noindent where {\tt A} is the matrix over which the QR -decmposition is executed. +decomposition is executed. The QR decomposition was computed for a $2048\times 2048$ random matrix using tiles of size $64\times 64$ using QuickSched @@ -1165,10 +1185,8 @@ as part of his MSc thesis in Computer Science at Durham University. For this matrix, a total of 11440 tasks with 32240 dependencies were generated. -All tests were run on a 64-core AMD Opteron 6376 machine running -at 2.6\,GHz. For these tests, OpenMP parallelism and resource re-owning -were used. +were used with one queue per core. The QR decomposition was computed 10 times for each number of cores, and the average thereof taken for the scaling and efficiency results in \fig{QRResults}. @@ -1177,17 +1195,42 @@ The timings are for {\tt qsched\_run}, including the cost of Setting up the scheduler, tasks, and resources took, in all cases, an average of 7.2\,ms. +The same decomposition was implemented using OmpSs v.\,1.99.0, +calling the kernels directly using {\tt \#pragma omp task} +annotations with the respective dependencies. +The scaling and efficiency relative to QuickSched are +shown in \fig{QRResults} as well. +The difference in timings is the result of the different +task scheduling policies, as well as a smaller lag between the +individual tasks, as shown in \fig{QRTasks}, +for a smaller $1024\times 1024$ matrix on 16 cores of the +same hardware. 
+The most visible difference between both schedulers is that +the DGEQRF tasks (in red) are scheduled as soon as they +become available in QuickSched, thus preventing bottlenecks +near the end of the computation. + \begin{figure} \centerline{\epsfig{file=figures/QR_scaling.pdf,width=0.9\textwidth}} \caption{Strong scaling and parallel efficiency of the tiled QR decomposition computed over a $2048\times 2048$ matrix with tiles of size $64\times 64$. - The QR decomposition takes 233\,ms, achieving 73\% parallel - efficiency, over all 64 cores. + The QR decomposition with QuickSched takes 233\,ms, + achieving 73\% parallel efficiency, over all 64 cores. + The scaling and efficiency for OmpSs are computed relative to QuickSched. } \label{fig:QRResults} \end{figure} +\begin{figure} + \centerline{\epsfig{file=figures/tasks_qr.pdf,width=0.9\textwidth}} + \centerline{\epsfig{file=figures/tasks_qr_ompss.pdf,width=0.9\textwidth}} + \caption{Task scheduling in QuickSched (above) and OmpSs (below) + for a $1024\times 1024$ matrix on 16 cores. + The task colors correspond to those in \fig{QR}.} + \label{fig:QRTasks} +\end{figure} + \subsection{Task-Based Barnes-Hut N-Body Solver} @@ -1196,18 +1239,18 @@ solution of an $N$-body problem, i.e.~computing all the pairwise interactions between a set of $N$ particles, in \oh{N\log N} operations, as opposed to the \oh{N^2} naive direct computation. - The algorithm is based on a recursive octree decomposition: Starting from a cubic cell containing all the particles, -the cell is recursivel bisected along all three spatial dimensions, +the cell is recursively bisected along all three spatial dimensions, resulting in eight sub-cells, until the number of particles per cell is smaller than some limit $n_\mathsf{max}$. 
The particle interactions can also be formulated recursively: -Given a particle an a cell of particles, if the particle and cell +Given a particle and a set of particles in a cell, +if the particle and cell are sufficiently well separated, the particle-cell interactions are approximated by interacting the particle with the cell's -centre of mass. +center of mass. If the particle and the cell are too close, and the cell has sub-cells, i.e.~it contained more than $n_\mathsf{max}$ particles and was split in the recursive octree decomposition, @@ -1220,7 +1263,8 @@ it is in the same cell. This operation is performed for each particle, starting with the root-level cell containing all the particles. -The cells have the following structure: +The cells themselves are implemented using the following +data structure: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} struct cell { @@ -1236,8 +1280,8 @@ struct cell { \noindent where {\tt loc} and {\tt h} are the location and size of the cell, respectively. The {\tt com} and {\tt mass} fields represent the cell's -center of mass, which will be used in the interactions. -The {\tt res} filed is the hieararchical resource representing +center of mass, which will be used in the particle-cell interactions. +The {\tt res} field is the hierarchical resource representing the cell's particles, and it is the parent resource of the cell progeny's {\tt res}. Similarly, the {\tt task\_com} is a task handle to @@ -1245,7 +1289,7 @@ compute the center of mass of the cell's particles, and it depends on the {\tt task\_com} of all the progeny if the cell is split. 
{\tt parts} is a pointer to an array of {\tt count} -particle strutures, which contain all the particle +particle structures, which contain all the particle data of the form: \begin{center}\begin{minipage}{0.9\textwidth} \begin{lstlisting} @@ -1255,7 +1299,7 @@ struct part { }; \end{lstlisting} \end{minipage}\end{center} -\noindent i.e.~the particle position, accelleration, mass, +\noindent i.e.~the particle position, acceleration, mass, and ID, respectively. The particle data is sorted hierarchically, following the @@ -1275,7 +1319,7 @@ is in \oh{N\log N}. \centerline{\epsfig{file=figures/CellParts.pdf,width=0.9\textwidth}} \caption{Hierarchical ordering of the particle data structures (right) according to their cell (left). - Each cell has a pointer to the first of its particles (same colour + Each cell has a pointer to the first of its particles (same color as cells) in the same global parts array.} \label{fig:CellParts} \end{figure} @@ -1286,12 +1330,12 @@ types of tasks: \item {\em Self}-interactions in which all particles in a single cell interact with all other particles in the same cell, - \item {\em Particle-particle pair}-interactions in which - all particles in a pair of cells interact with all - particles in the opposite cell, and - \item {\em Particle-cell pair}-interactions in which + \item {\em Particle-particle} pair interactions in which + all particles in a cell interact with all + particles in another cell, + \item {\em Particle-cell} pair interactions in which all particles in one cell are interacted with the - center of mass of another cell. + center of mass of another cell, and \item {\em Center of mass} tasks, which compute the center of mass of a single cell either from the sum of the centers of mass of its sub-cells @@ -1365,9 +1409,9 @@ and {\tt NULL} as its two cell parameters. 
The function recurses as follows: \begin{itemize} \item If called with a single (line~6), split (line~7) cell, - recurse over all the cell's sub cells (line~9), and all - pairs of the cell's sub cells (line~11), - \item If called with a single unslplit cell (line~13), + recurse over all the cell's sub-cells (line~9), and all + pairs of the cell's sub-cells (line~11), + \item If called with a single unsplit cell (line~13), create a self-interaction task on that cell (line~14), \item If called with two cells that are sufficiently well separated (line~21), create two particle-cell pair @@ -1385,8 +1429,8 @@ The function recurses as follows: \noindent where every interaction task additionally locks the cells on which it operates (lines~16, 25, 30, and 42--43). -In order to reduce the number of tasks, and to prevent generating -too many very small tasks, the task generation only recurses +In order to prevent generating +a large number of very small tasks, the task generation only recurses if the cells contain more than a minimum number $n_\mathsf{task}$ of threads each (lines~7 and~34). The tasks themselves are then left to recurse over the sub-trees, @@ -1486,21 +1530,24 @@ void exec_fun ( int type , void *data ) { This Barnes-Hut tree-code was used to approximate the gravitational -N-Body problem for 1\,000\,000 particles with random coordinates the -parameters $n_\mathsf{max}=100$ and $n_\mathsf{task}=5000$. -Cell pairs were considered well separated if not adjecent. +N-Body problem for 1\,000\,000 particles with random coordinates +in $[0,1]^3$. +The parameters $n_\mathsf{max}=100$ and $n_\mathsf{task}=5000$ +were used to generate the tasks, and cell pairs were considered +well separated if not directly adjacent. Using the above scheme generated 161\,613 tasks, of which 512 self-interaction tasks, 18\,532 particle-particle interaction task, 105\,120 particle-cell interaction tasks, and 37\,449 center of mass tasks. 
-Additionally 179\,632 dependencies were generated, along with -142'696 locks on 37\,449 resources. +A total of 179\,632 dependencies were generated, along with +142\,696 locks on 37\,449 resources. -As with the previous example, all -tests were run on a 64-core AMD Opteron 6376 machine running -at 2.6\,GHz. For these tests, OpenMP parallelism was used and resource re-owning was switched off. +Resource ownership was attributed by dividing the global +{\tt parts} array by the number of queues and assigning each cell's +{\tt res} to the fraction of the {\tt parts} array to which +the first of its own {\tt parts} belongs. The interactions were computed 10 times for each number of cores, and the average thereof taken for the scaling and efficiency results in \fig{BHResults}. @@ -1513,7 +1560,7 @@ cases, an average of 51.3\,ms. \centerline{\epsfig{file=figures/BH_scaling.pdf,width=0.9\textwidth}} \caption{Strong scaling and parallel efficiency of the Barnes-Hut tree-code computed over 1\,000\,000 particles. - Solving the N-Body problem takes 3.5\,s, achieving 52\% parallel + Solving the N-Body problem takes 3.3\,s, achieving 60\% parallel efficiency, over all 64 cores. } \label{fig:BHResults} @@ -1523,35 +1570,105 @@ Unlike the QR decomposition, the results scale well only to 32 cores, achieving 90\% parallel efficiency, and then level off for increasing numbers of cores. This, however, is not a problem of the task-based parallel -algorith, or of QuickSched, but of the cache hierarchies +algorithm, or of QuickSched, but of the memory bandwidth of the underlying hardware. -On the AMD Opteron 6376, the cores are grouped into pairs -which each share a 2\,MB L2 cache. -Each group of four pairs, or eight cores, shares a common -6\,MB L3 cache. -It is the difference between the sum of the L2 caches and -the (smaller) L3 cache which causes problems. +\fig{BHTimes} shows the accumulated cost of each task type and of +QuickSched over the number of cores. 
+At 64 cores, the scheduler overheads account for only 4.7\% of +the total computational cost, whereas, +as of 30 cores, the cost of both pair types grows by up to +50\%. +This is most probably due to memory bandwidth restrictions, as +the cost of the self interaction tasks, which do twice as much +computation per memory access, only grows by up to 15\%. +\begin{figure} + \centerline{\epsfig{file=figures/BH_times.pdf,width=0.8\textwidth}} + \caption{Accumulated cost of each task type and of the overheads + associated with {\tt qsched\_gettask}. + As of $\sim 30$ cores, the cost of both pair interaction task + types grows by up to 50\%. + The cost of the self interactions, which entail twice as much + computation per memory access, grows only by at most 15\%. + The scheduler overheads make up less than 5\% of the total + time. + } + \label{fig:BHTimes} +\end{figure} -\section{Conclusions} -Main points/differences to other schedulers: +\section{Discussion and Conclusions} + +The task scheduler described in the previous sections, QuickSched, +differs from existing task-based programming schemes +in a number of ways. +The most obvious such difference is the addition of {\em conflicts}, +modeled using exclusive locks on hierarchical resources. +This extension to the standard dependencies-only model +of task-based parallelism allows for more complex task relations, +such as in the Barnes-Hut tree-code described earlier. + +Another significant difference is that the tasks, their +dependencies, and their conflicts must be described +{\em explicitly} before the parallel computation starts. +This is as opposed to implicit dependencies generated +by task spawning, e.g.~as in Cilk, or to extracting the +dependencies from the task parameters, e.g.~in QUARK or OmpSs. +Explicitly defining dependencies has the advantage that +more elaborate dependency structures can be generated. 
+Furthermore, knowing the structure of the entire task +graph from the start of the computation provides valuable +information when scheduling the tasks, e.g.~using the +critical path along the dependencies to compute the +task weight. + +Finally, as opposed to most other task-based +programming environments which rely on compiler extensions +and/or code pre-processors, QuickSched operates as a regular +C-language library, based on standard parallel functionality +provided by OpenMP and/or {\tt pthreads}. +This ensures a maximum amount of portability on existing +and future architectures. +The interfaces are also kept as simple +as possible in order to reduce the burden on the programmer +when implementing task-based codes. + +The QuickSched library itself is remarkably simple, consisting of +less than 3\,000 lines of code, including comments. +Both examples, which are distributed with QuickSched, +require less than 1\,000 lines of code each. + +In both examples, QuickSched performs extremely well, even +on a large number of shared-memory cores. +This performance is due, on the one hand, to the +division of labor between the scheduler and the queues, +and on the other hand due to the simple yet efficient +algorithms for task selection and resource locking. +The task weighting based on the length of the critical +path of the dependencies delivers, in the examples shown, +good parallel efficiency. + +There are several possible improvements to QuickSched which +have not been addressed in this paper. +The most obvious of these are the following: 
+ \item {\em Priorities}: The current implementation of + QuickSched does not take the resource locks into + account when selecting tasks in the queues, + \item {\em Work-stealing}: During work-stealing, the + queues are probed in a random order although + the total relative cost of the tasks in the queue, + as well as the length of their critical paths are + known, + \item {\em Costs}: The size of the resources used by + a task are currently not taken into account when + assigning it to the queues in {\tt qsched\_enqueue}, + or when approximating the cost of a task. \end{itemize} -Possible improvements -\begin{itemize} - \item Use number of locks as well when selecting from a queue. - \item Use total/max queue weight when selecting for work-stealing. - \item Use different resource sizes when computing queue score. - \item Dynamically add new tasks, but potentially wrecks havoc - with the weights. -\end{itemize} +QuickSched is distributed under the GNU Lesser General Public Licence +v\,3.0 and is available for download via +\url{http://quicksched.sourceforge.net}. % Acknowledgments