Commit 22186ed1 authored by Pedro Gonnet

new figures and backup.

parent 704f8980
[figures/BH_scaling.eps added: MATLAB-generated EPS with two panels, "Speedup Barnes-Hut" (annotated 3539ms) and "Parallel Efficiency Barnes-Hut" (annotated 52%), both plotted against "nr. cores".]
[figures/QR_scaling.eps added: MATLAB-generated EPS with two panels, "Speedup QR decomposition" (annotated 233ms) and "Parallel Efficiency QR decomposition" (annotated 73%), both plotted against "nr. cores".]
@@ -475,6 +475,8 @@ that are themselves subsets of larger resources.
This can be useful, e.g.~in the context of particle simulations
described in the next section, where particles are sorted
into hierarchical cells which are used at different levels.
The owner field is the ID of the queue to which this
resource has been preferentially assigned.
The {\tt lock} field is either {\tt 0} or {\tt 1} and indicates
whether this resource is currently in use, i.e.~{\em locked}.
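The full resource structure is not visible in this hunk; purely for
orientation, a minimal sketch consistent with the fields described above
(the struct and field names are assumptions, not necessarily the library's
actual layout) could look as follows:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
/* Hypothetical sketch of a resource, based only on the fields described
   in the surrounding text; not the actual QuickSched data structure. */
struct qsched_res {
    volatile int lock;   /* 0 if free, 1 if currently in use (locked). */
    int owner;           /* ID of the queue preferentially owning it. */
    qsched_res_t parent; /* Hierarchical parent, or qsched_res_none. */
};
\end{lstlisting}
\end{minipage}\end{center}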
@@ -798,7 +800,7 @@ struct task *qsched_gettask ( qsched *s , int qid ) {
else
break;
}
if ( res != NULL && s->reown ) {
for ( k = 0 ; k < res->nr_locks ; k++ )
res->locks[k]->owner = qid;
for ( k = 0 ; k < res->nr_uses ; k++ )
@@ -815,7 +817,8 @@ unresolved conflicts, the scheduler uses {\em work stealing}
\cite{ref:Blumofe1999}, i.e.~it loops over all other queues
in a random order (line~6) and tries to get a task from them
(line~7).
If a task could be obtained from any queue and task re-owning
is switched on (line~13),
the resources it locks and uses are marked as now being owned
by the preferred queue (lines~14--17).
Finally, the task, or {\tt NULL} if no task could be obtained,
@@ -894,9 +897,11 @@ The main functions for setting up the scheduler are:
e.g. they are used only to group or otherwise organize dependencies, and
are not passed to the execution function in {\tt qsched\_run}.
\vspace{1mm}
\item {\tt qsched\_res\_t qsched\_addres( struct qsched *s , int owner , qsched\_res\_t parent )} \\
Creates a new resource within the given {\tt qsched} and returns
its handle.
The owner field is the initial queue ID to which this resource
should be assigned, or {\tt qsched\_owner\_none}.
The {\tt parent} field is the handle of the hierarchical parent of
the new resource or {\tt qsched\_res\_none} if the resource
has no hierarchical parent.
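As an illustration only, and assuming a scheduler {\tt s} that has already
been initialized, a two-level resource hierarchy could be set up roughly as
follows (the variable names are placeholders, not code from the library's
examples):
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
/* Illustrative sketch: a parent resource preferentially owned by
   queue 0, and a child resource with no preferred owner. */
qsched_res_t res_parent, res_child;
res_parent = qsched_addres( &s , 0 , qsched_res_none );
res_child  = qsched_addres( &s , qsched_owner_none , res_parent );
\end{lstlisting}
\end{minipage}\end{center}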
@@ -959,8 +964,22 @@ This section presents two test cases showing both
how QuickSched can be used in real-world applications, and
providing benchmarks to assess its efficiency and scalability.
The first test is the tiled QR decomposition originally
from \citeN{ref:Buttari2009}, which has been used as a benchmark
by other authors \cite{ref:Agullo2009b,ref:Badia2009,ref:Bosilca2012}.
This example only requires dependencies and is presented
only as a point of comparison to existing task-based
parallel infrastructures.
The second example is a Barnes-Hut tree-code, a problem
similar to the Fast Multipole Method described in both
\citeN{ref:Ltaief2012} and \citeN{ref:Agullo2013}.
This example shows how conflicts, modeled
via hierarchical resources, can be useful.
The source code of both examples is distributed with the
QuickSched library, along with scripts to run the benchmarks
and generate the plots used in the following.
\subsection{Task-Based QR Decomposition}
@@ -1135,11 +1154,383 @@ void exec_fun ( int type , void *data ) {
\noindent where {\tt A} is the matrix over which the QR
decomposition is executed.
The QR decomposition was computed for a $2048\times 2048$
random matrix with tiles of size $64\times 64$, using QuickSched
as described above.
For the DGEQRF and DLARFT kernels, the respective routines
from the LAPACKE\footnote{\url{http://www.netlib.org/lapack/lapacke.html}}
library were used.
The DTSQRF and DSSRFT kernels were implemented by Sam Townsend
as part of his MSc thesis in Computer Science at Durham University.
For this matrix, a total of 11440 tasks with 32240 dependencies
were generated.
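As a consistency check, and assuming the usual structure of the tiled QR
task graph (the per-kernel breakdown is not given explicitly here), this
task count matches the closed form $\sum_{i=1}^{T} i^2 = T(T+1)(2T+1)/6$
for $T = 2048/64 = 32$ tiles per dimension:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
/* Hypothetical check: count the tasks of a tiled QR decomposition with
   T tiles per dimension, assuming one panel factorization, 2*(T-1-k)
   single-tile updates, and (T-1-k)^2 coupled updates in panel k.
   For T = 32 this evaluates to 11440, i.e. T*(T+1)*(2*T+1)/6. */
int count_qr_tasks ( int T ) {
    int k, count = 0;
    for ( k = 0 ; k < T ; k++ )
        count += 1 + 2*(T-1-k) + (T-1-k)*(T-1-k);
    return count;
}
\end{lstlisting}
\end{minipage}\end{center}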
All tests were run on a 64-core AMD Opteron 6376 machine running
at 2.6\,GHz.
For these tests, OpenMP parallelism and resource re-owning
were used.
The QR decomposition was computed 10 times for each number of
cores, and the average thereof taken for the scaling and
efficiency results in \fig{QRResults}.
The timings are for {\tt qsched\_run}, including the cost of
{\tt qsched\_start}, which does not run in parallel.
Setting up the scheduler, tasks, and resources took, in all
cases, an average of 7.2\,ms.
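For reference, and assuming the usual definition (it is not stated
explicitly in the text), the parallel efficiency reported below is
\[
    \mathsf{efficiency}(p) = \frac{T_1}{p\,T_p},
\]
where $T_p$ denotes the measured runtime on $p$ cores.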
\begin{figure}
\centerline{\epsfig{file=figures/QR_scaling.pdf,width=0.9\textwidth}}
\caption{Strong scaling and parallel efficiency of the tiled QR decomposition
computed over a $2048\times 2048$ matrix with tiles of size
$64\times 64$.
The QR decomposition takes 233\,ms, achieving 73\% parallel
efficiency, over all 64 cores.
}
\label{fig:QRResults}
\end{figure}
\subsection{Task-Based Barnes-Hut N-Body Solver}
The Barnes-Hut tree-code is an algorithm to approximate the
solution of an $N$-body problem, i.e.~computing all the
pairwise interactions between a set of $N$ particles,
in \oh{N\log N} operations, as opposed to the \oh{N^2}
naive direct computation.
The algorithm is based on a recursive octree decomposition:
Starting from a cubic cell containing all the particles,
the cell is recursively bisected along all three spatial dimensions,
resulting in eight sub-cells, until the number of particles
per cell is smaller than some limit $n_\mathsf{max}$.
The particle interactions can also be formulated recursively:
Given a particle and a cell of particles, if the particle and cell
are sufficiently well separated, the particle-cell interactions
are approximated by interacting the particle with the cell's
centre of mass.
If the particle and the cell are too close, and the cell
has sub-cells, i.e.~it contained more than $n_\mathsf{max}$
particles and was split in the recursive octree decomposition,
then the particle is interacted with each of the sub-cells
recursively.
Finally, if the cell is not split, i.e.~it is a leaf cell
in the octree, then the particle is interacted with all
particles in the cell, except for the particle itself if
it is in the same cell.
This operation is performed for each particle, starting
with the root-level cell containing all the particles.
The cells have the following structure:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
struct cell {
double loc[3], h[3], com[3], mass;
int split, count;
struct part *parts;
struct cell *progeny[8];
qsched_res_t res;
qsched_task_t task_com;
};
\end{lstlisting}
\end{minipage}\end{center}
\noindent where {\tt loc} and {\tt h} are the location
and size of the cell, respectively.
The {\tt com} and {\tt mass} fields represent the cell's
center of mass, which will be used in the interactions.
The {\tt res} field is the hierarchical resource representing
the cell's particles, and it is the parent resource of the cell
progeny's {\tt res}.
Similarly, the {\tt task\_com} is a task handle to
compute the center of mass of the cell's particles, and
it depends on the {\tt task\_com} of all the progeny if
the cell is split.
{\tt parts} is a pointer to an array of {\tt count}
particle structures, which contain all the particle
data of the form:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
struct part {
double x[3], a[3], mass;
int id;
};
\end{lstlisting}
\end{minipage}\end{center}
\noindent i.e.~the particle position, acceleration, mass,
and ID, respectively.
The particle data is sorted hierarchically, following the
octree structure.
Unlike in many codes, where the leaves store an array of
pointers to the underlying particles, the cells, at all
levels, store only a pointer to the first of their own particles,
and the total number of particles.
The current approach, illustrated in \fig{CellParts}, is not
only more compact, but also allows direct and more cache-efficient
access to the list of particles for any inner node of the tree.
The cost of sorting the particles, with a recursive
partitioning similar to QuickSort \cite{ref:Hoare1962},
is in \oh{N\log N}.
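A minimal sketch of one partitioning step is shown below; it is an
illustration only (the function name and pivot handling are assumptions,
not the code shipped with the library). Applying it recursively along the
$x$, $y$, and $z$ dimensions, and then to each half of the array,
reproduces the octree ordering shown in \fig{CellParts}:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
/* Hypothetical sketch: partition the particles of a cell in-place around
   the given pivot (e.g. the cell midpoint) along one dimension.  Returns
   the index of the first particle at or above the pivot. */
int part_partition ( struct part *parts , int count , int dim , double pivot ) {
    int i = 0, j = count - 1;
    while ( i <= j ) {
        while ( i <= j && parts[i].x[dim] < pivot ) i++;
        while ( i <= j && parts[j].x[dim] >= pivot ) j--;
        if ( i < j ) {
            struct part tmp = parts[i];
            parts[i] = parts[j];
            parts[j] = tmp;
        }
    }
    return i;
}
\end{lstlisting}
\end{minipage}\end{center}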
\begin{figure}
\centerline{\epsfig{file=figures/CellParts.pdf,width=0.9\textwidth}}
\caption{Hierarchical ordering of the particle data structures
(right) according to their cell (left).
Each cell has a pointer to the first of its particles (same colour
as cells) in the same global parts array.}
\label{fig:CellParts}
\end{figure}
The task-based implementation will consist of four
types of tasks:
\begin{itemize}
\item {\em Self}-interactions in which all particles
in a single cell interact with all other particles in the
same cell,
\item {\em Particle-particle pair}-interactions in which
all particles in a pair of cells interact with all
particles in the opposite cell,
\item {\em Particle-cell pair}-interactions in which
all particles in one cell are interacted with the
center of mass of another cell, and
\item {\em Center of mass} tasks, which compute
the center of mass of a single cell either from
the sum of the centers of mass of its sub-cells
if it has been split, or directly from the
particles otherwise.
\end{itemize}
\begin{figure}
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
enum { tSELF , tPAIR_PP , tPAIR_PC , tCOM };
void make_tasks ( struct qsched *s , struct cell *ci , struct cell *cj ) {
int j, k;
qsched_task_t tid;
struct cell *data[2];
if ( cj == NULL ) {
if ( ci->split && ci->count > n_task )
for ( j = 0 ; j < 8 ; j++ ) {
make_tasks( s , ci->progeny[j] , NULL );
for ( k = j+1 ; k < 8 ; k++ )
make_tasks( s , ci->progeny[j] , ci->progeny[k] );
}
else {
tid = qsched_addtask( s , tSELF , qsched_flags_none ,
&ci , sizeof(struct cell *) , ci->count*ci->count );
qsched_addlock( s , tid , ci->res );
if ( ci->split )
qsched_addunlock( s , ci->com , tid );
}
}
else if ( ci and cj are well separated ) {
data[0] = ci; data[1] = cj;
tid = qsched_addtask( s , tPAIR_PC , qsched_flags_none ,
data , sizeof(struct cell *)*2 , ci->count );
qsched_addlock( s , tid , ci->res );
qsched_addunlock( s , cj->com , tid );
data[0] = cj; data[1] = ci;
tid = qsched_addtask( s , tPAIR_PC , qsched_flags_none ,
data , sizeof(struct cell *)*2 , cj->count );
qsched_addlock( s , tid , cj->res );
qsched_addunlock( s , ci->com , tid );
}
else if ( ci->split && cj->split &&
ci->count*cj->count > n_task*n_task )
for ( j = 0 ; j < 8 ; j++ )
for ( k = 0 ; k < 8 ; k++ )
make_tasks( s , ci->progeny[j] , cj->progeny[k] );
else {
data[0] = ci; data[1] = cj;
tid = qsched_addtask( s , tPAIR_PP , qsched_flags_none ,
data , sizeof(struct cell *)*2 , ci->count*cj->count );
qsched_addlock( s , tid , ci->res );
qsched_addlock( s , tid , cj->res );
if ( ci->split && cj->split ) {
qsched_addunlock( s , ci->com , tid );
qsched_addunlock( s , cj->com , tid );
}
}
}
\end{lstlisting}
\end{minipage}\end{center}
\caption{C-like pseudo-code for recursive task creation
for the Barnes-Hut tree-code.}
\label{fig:MakeTasks}
\end{figure}
These tasks can be created recursively over the cell hierarchy
as shown in the function {\tt make\_tasks} in \fig{MakeTasks}.
The function is called on the root cell with the root cell
and {\tt NULL} as its two cell parameters.
The function recurses as follows:
\begin{itemize}
\item If called with a single (line~6), split (line~7) cell,
recurse over all the cell's sub cells (line~9), and all
pairs of the cell's sub cells (line~11),
\item If called with a single unsplit cell (line~13),
create a self-interaction task on that cell (line~14),
\item If called with two cells that are sufficiently well
separated (line~21), create two particle-cell pair
interactions (lines~23 and~28) over both cells in
opposite orders, which depend on the center of mass
task of each cell,
\item If called with two cells that are not well
separated and both cells are split (line~33),
recurse over all pairs of sub-cells spanning
both cells (line~37), and
\item If called with two cells that are not well separated
and either of the cells are not split, create
a particle-particle pair task over both cells.
\end{itemize}
\noindent where every interaction task additionally locks
the cells on which it operates (lines~16, 25, 30, and 42--43).
In order to reduce the number of tasks, and to prevent generating
too many very small tasks, the task generation only recurses
if the cells contain more than a minimum number $n_\mathsf{task}$
of particles each (lines~7 and~34).
The tasks themselves are then left to recurse over the sub-trees,
which is why in these cases, the tasks are made to depend on the
center of mass tasks (lines~17--18 and~41--47)
which may be used in the ensuing interactions.
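The condition ``{\tt ci} and {\tt cj} are well separated'' is left as
pseudo-code in \fig{MakeTasks}; in the benchmarks below it is taken to
mean that the two cells are not adjacent. A minimal sketch of such a
test, using only the {\tt loc} and {\tt h} fields of {\tt struct cell}
and written here for illustration (it is not necessarily the exact
criterion used in the distributed example code), is:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
/* Hypothetical sketch: two cells are considered well separated if there
   is a gap between them along at least one spatial dimension. */
int well_separated ( struct cell *ci , struct cell *cj ) {
    int k;
    for ( k = 0 ; k < 3 ; k++ )
        if ( ci->loc[k] + ci->h[k] < cj->loc[k] ||
             cj->loc[k] + cj->h[k] < ci->loc[k] )
            return 1;
    return 0;
}
\end{lstlisting}
\end{minipage}\end{center}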
\begin{figure}
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
void comp_com ( struct cell *c ) {
int j, k;
c->com[0] = 0.0; c->com[1] = 0.0; c->com[2] = 0.0;
c->mass = 0.0;
if ( c->split )
for ( k = 0 ; k < 8 ; k++ ) {
struct cell *cp = c->progeny[k];
for ( j = 0 ; j < 3 ; j++ ) c->com[j] += cp->com[j]*cp->mass;
c->mass += cp->mass;
}
else
for ( k = 0 ; k < c->count ; k++ ) {
struct part *p = &c->parts[k];
for ( j = 0 ; j < 3 ; j++ ) c->com[j] += p->x[j]*p->mass;
c->mass += p->mass;
}
c->com[0] /= c->mass; c->com[1] /= c->mass; c->com[2] /= c->mass;
}
void comp_self ( struct cell *c ) {
int j, k;
if ( c->split )
for ( j = 0 ; j < 8 ; j++ ) {
comp_self( c->progeny[j] );
for ( k = j+1 ; k < 8 ; k++ )
comp_pair( c->progeny[j] , c->progeny[k] );
}
else
for ( j = 0 ; j < c->count ; j++ )
for ( k = j+1 ; k < c->count ; k++ )
interact c->parts[j] and c->parts[k].
}
void comp_pair ( struct cell *ci , struct cell *cj ) {
int j, k;
if ( ci and cj well separated ) {
comp_pair_pc( ci , cj );
comp_pair_pc( cj , ci );
}
else if ( ci->split && cj->split )
for ( j = 0 ; j < 8 ; j++ )
for ( k = 0 ; k < 8 ; k++ )
comp_pair( ci->progeny[j] , cj->progeny[k] );
else
for ( j = 0 ; j < ci->count ; j++ )
for ( k = 0 ; k < cj->count ; k++ )
interact ci->parts[j] and cj->parts[k].
}
void comp_pair_pc ( struct cell *ci , struct cell *cj ) {
int k;
for ( k = 0 ; k < ci->count ; k++ )
interact ci->parts[k] and cj center of mass.
}
\end{lstlisting}
\end{minipage}\end{center}
\caption{Task functions for the Barnes-Hut tree-code.}
\label{fig:BHTasks}
\end{figure}
The functions for the tasks themselves are relatively
straightforward and shown in \fig{BHTasks}, and the
execution function can be written as:
\begin{center}\begin{minipage}{0.9\textwidth}
\begin{lstlisting}
void exec_fun ( int type , void *data ) {
struct cell **cells = (struct cell **)data;
switch ( type ) {
case tSELF:
comp_self( cells[0] );
break;
case tPAIR_PP:
comp_pair( cells[0] , cells[1] );
break;
case tPAIR_PC:
comp_pair_pc( cells[0] , cells[1] );
break;
case tCOM:
comp_com( cells[0] );
break;
default:
error( "Unknown task type." );
}
}
\end{lstlisting}
\end{minipage}\end{center}
This Barnes-Hut tree-code was used to approximate the gravitational
$N$-body problem for 1\,000\,000 particles with random coordinates and
the parameters $n_\mathsf{max}=100$ and $n_\mathsf{task}=5000$.
Cell pairs were considered well separated if not adjacent.
The above scheme generated 161\,613 tasks, of which
512 were self-interaction tasks, 18\,532 particle-particle interaction
tasks, 105\,120 particle-cell interaction tasks, and 37\,449
center of mass tasks.
Additionally, 179\,632 dependencies were generated, along with
142\,696 locks on 37\,449 resources.
As with the previous example, all
tests were run on a 64-core AMD Opteron 6376 machine running
at 2.6\,GHz.
For these tests, OpenMP parallelism was used and resource
re-owning was switched off.
The interactions were computed 10 times for each number of
cores, and the average thereof taken for the scaling and
efficiency results in \fig{BHResults}.
The timings are for {\tt qsched\_run}, including the cost of
{\tt qsched\_start}, which does not run in parallel.
Setting up the scheduler, tasks, and resources took, in all
cases, an average of 51.3\,ms.
\begin{figure}
\centerline{\epsfig{file=figures/BH_scaling.pdf,width=0.9\textwidth}}
\caption{Strong scaling and parallel efficiency of the Barnes-Hut tree-code
computed over 1\,000\,000 particles.
Solving the N-Body problem takes 3.5\,s, achieving 52\% parallel
efficiency, over all 64 cores.
}
\label{fig:BHResults}
\end{figure}
Unlike the QR decomposition, the results scale well only to
32 cores, achieving 90\% parallel efficiency, and then
level off for increasing numbers of cores.
This, however, is not a problem of the task-based parallel
algorithm, or of QuickSched, but of the cache hierarchies
of the underlying hardware.
On the AMD Opteron 6376, the cores are grouped into pairs
which each share a 2\,MB L2 cache.
Each group of four pairs, or eight cores, shares a common
6\,MB L3 cache.
It is the mismatch between the combined size of the L2 caches in each
group (8\,MB) and the smaller shared 6\,MB L3 cache which causes problems.
\section{Conclusions}
@@ -169,3 +169,45 @@
year={2009},
organization={IOP Publishing}
}
@article{ref:Hoare1962,
title={Quicksort},
author={Hoare, Charles AR},
journal={The Computer Journal},
volume={5},
number={1},
pages={10--16},
year={1962},
publisher={British Computer Society}
}
@article{ref:Bosilca2012,
title={{DAGuE}: A generic distributed DAG engine for high performance computing},
author={Bosilca, George and Bouteiller, Aurelien and Danalis, Anthony and Herault, Thomas and Lemarinier, Pierre and Dongarra, Jack},
journal={Parallel Computing},
volume={38},
number={1},
pages={37--51},
year={2012},
publisher={Elsevier}
}
@article{ref:Badia2009,
title={Parallelizing dense and banded linear algebra libraries using {SMPSs}},
author={Badia, Rosa M and Herrero, Jos{\'e} R and Labarta, Jes{\'u}s and P{\'e}rez, Josep M and Quintana-Ort{\'\i}, Enrique S and Quintana-Ort{\'\i}, Gregorio},
journal={Concurrency and Computation: Practice and Experience},
volume={21},
number={18},
pages={2438--2456},
year={2009},
publisher={Wiley Online Library}
}
@inproceedings{ref:Agullo2009b,
title={Comparative study of one-sided factorizations with multiple software packages on multi-core hardware},
author={Agullo, Emmanuel and Hadri, Bilel and Ltaief, Hatem and Dongarra, Jack},
booktitle={Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis},
pages={20},
year={2009},
organization={ACM}
}