Skip to content
Snippets Groups Projects
Commit 1164eeb8 authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

major extensions of the scheduler's capabilities.

parent 766668f3
No related branches found
No related tags found
No related merge requests found
...@@ -20,16 +20,26 @@ ...@@ -20,16 +20,26 @@
AUTOMAKE_OPTIONS=gnu AUTOMAKE_OPTIONS=gnu
# Add the source directory and debug to CFLAGS # Add the source directory and debug to CFLAGS
AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=2.67e9 AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=2.67e9 \
-fsanitize=address -fno-omit-frame-pointer
AM_LDFLAGS = -lm AM_LDFLAGS = -lm -fsanitize=address
# Set-up the library # Set-up the library
bin_PROGRAMS = test bin_PROGRAMS = test test_qr test_bh
# Sources for test # Sources for test
test_SOURCES = test.c test_SOURCES = test.c
test_CFLAGS = $(AM_CFLAGS) test_CFLAGS = $(AM_CFLAGS)
test_LDADD = ../src/.libs/libquicksched.a test_LDADD = ../src/.libs/libquicksched.a
# Sources for test_qr
test_qr_SOURCES = test_qr.c
test_qr_CFLAGS = $(AM_CFLAGS)
test_qr_LDADD = ../src/.libs/libquicksched.a -lblas
# Sources for test_bh
test_bh_SOURCES = test_bh.c
test_bh_CFLAGS = $(AM_CFLAGS)
test_bh_LDADD = ../src/.libs/libquicksched.a
...@@ -68,7 +68,7 @@ PRE_UNINSTALL = : ...@@ -68,7 +68,7 @@ PRE_UNINSTALL = :
POST_UNINSTALL = : POST_UNINSTALL = :
build_triplet = @build@ build_triplet = @build@
host_triplet = @host@ host_triplet = @host@
bin_PROGRAMS = test$(EXEEXT) bin_PROGRAMS = test$(EXEEXT) test_qr$(EXEEXT) test_bh$(EXEEXT)
subdir = examples subdir = examples
DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
...@@ -98,6 +98,18 @@ test_DEPENDENCIES = ../src/.libs/libquicksched.a ...@@ -98,6 +98,18 @@ test_DEPENDENCIES = ../src/.libs/libquicksched.a
test_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ test_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=link $(CCLD) $(test_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ --mode=link $(CCLD) $(test_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
$(LDFLAGS) -o $@ $(LDFLAGS) -o $@
am_test_bh_OBJECTS = test_bh-test_bh.$(OBJEXT)
test_bh_OBJECTS = $(am_test_bh_OBJECTS)
test_bh_DEPENDENCIES = ../src/.libs/libquicksched.a
test_bh_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=link $(CCLD) $(test_bh_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
$(LDFLAGS) -o $@
am_test_qr_OBJECTS = test_qr-test_qr.$(OBJEXT)
test_qr_OBJECTS = $(am_test_qr_OBJECTS)
test_qr_DEPENDENCIES = ../src/.libs/libquicksched.a
test_qr_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=link $(CCLD) $(test_qr_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
$(LDFLAGS) -o $@
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles am__depfiles_maybe = depfiles
...@@ -111,8 +123,8 @@ CCLD = $(CC) ...@@ -111,8 +123,8 @@ CCLD = $(CC)
LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
$(LDFLAGS) -o $@ $(LDFLAGS) -o $@
SOURCES = $(test_SOURCES) SOURCES = $(test_SOURCES) $(test_bh_SOURCES) $(test_qr_SOURCES)
DIST_SOURCES = $(test_SOURCES) DIST_SOURCES = $(test_SOURCES) $(test_bh_SOURCES) $(test_qr_SOURCES)
am__can_run_installinfo = \ am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \ case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \ n|no|NO) false;; \
...@@ -270,13 +282,25 @@ top_srcdir = @top_srcdir@ ...@@ -270,13 +282,25 @@ top_srcdir = @top_srcdir@
AUTOMAKE_OPTIONS = gnu AUTOMAKE_OPTIONS = gnu
# Add the source directory and debug to CFLAGS # Add the source directory and debug to CFLAGS
AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=2.67e9 AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=2.67e9 \
AM_LDFLAGS = -lm -fsanitize=address -fno-omit-frame-pointer
AM_LDFLAGS = -lm -fsanitize=address
# Sources for test # Sources for test
test_SOURCES = test.c test_SOURCES = test.c
test_CFLAGS = $(AM_CFLAGS) test_CFLAGS = $(AM_CFLAGS)
test_LDADD = ../src/.libs/libquicksched.a test_LDADD = ../src/.libs/libquicksched.a
# Sources for test_qr
test_qr_SOURCES = test_qr.c
test_qr_CFLAGS = $(AM_CFLAGS)
test_qr_LDADD = ../src/.libs/libquicksched.a -lblas
# Sources for test_bh
test_bh_SOURCES = test_bh.c
test_bh_CFLAGS = $(AM_CFLAGS)
test_bh_LDADD = ../src/.libs/libquicksched.a
all: all-am all: all-am
.SUFFIXES: .SUFFIXES:
...@@ -360,6 +384,12 @@ clean-binPROGRAMS: ...@@ -360,6 +384,12 @@ clean-binPROGRAMS:
test$(EXEEXT): $(test_OBJECTS) $(test_DEPENDENCIES) $(EXTRA_test_DEPENDENCIES) test$(EXEEXT): $(test_OBJECTS) $(test_DEPENDENCIES) $(EXTRA_test_DEPENDENCIES)
@rm -f test$(EXEEXT) @rm -f test$(EXEEXT)
$(test_LINK) $(test_OBJECTS) $(test_LDADD) $(LIBS) $(test_LINK) $(test_OBJECTS) $(test_LDADD) $(LIBS)
test_bh$(EXEEXT): $(test_bh_OBJECTS) $(test_bh_DEPENDENCIES) $(EXTRA_test_bh_DEPENDENCIES)
@rm -f test_bh$(EXEEXT)
$(test_bh_LINK) $(test_bh_OBJECTS) $(test_bh_LDADD) $(LIBS)
test_qr$(EXEEXT): $(test_qr_OBJECTS) $(test_qr_DEPENDENCIES) $(EXTRA_test_qr_DEPENDENCIES)
@rm -f test_qr$(EXEEXT)
$(test_qr_LINK) $(test_qr_OBJECTS) $(test_qr_LDADD) $(LIBS)
mostlyclean-compile: mostlyclean-compile:
-rm -f *.$(OBJEXT) -rm -f *.$(OBJEXT)
...@@ -368,6 +398,8 @@ distclean-compile: ...@@ -368,6 +398,8 @@ distclean-compile:
-rm -f *.tab.c -rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test-test.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_bh-test_bh.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_qr-test_qr.Po@am__quote@
.c.o: .c.o:
@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
...@@ -404,6 +436,34 @@ test-test.obj: test.c ...@@ -404,6 +436,34 @@ test-test.obj: test.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_CFLAGS) $(CFLAGS) -c -o test-test.obj `if test -f 'test.c'; then $(CYGPATH_W) 'test.c'; else $(CYGPATH_W) '$(srcdir)/test.c'; fi` @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_CFLAGS) $(CFLAGS) -c -o test-test.obj `if test -f 'test.c'; then $(CYGPATH_W) 'test.c'; else $(CYGPATH_W) '$(srcdir)/test.c'; fi`
test_bh-test_bh.o: test_bh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_bh_CFLAGS) $(CFLAGS) -MT test_bh-test_bh.o -MD -MP -MF $(DEPDIR)/test_bh-test_bh.Tpo -c -o test_bh-test_bh.o `test -f 'test_bh.c' || echo '$(srcdir)/'`test_bh.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/test_bh-test_bh.Tpo $(DEPDIR)/test_bh-test_bh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_bh.c' object='test_bh-test_bh.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_bh_CFLAGS) $(CFLAGS) -c -o test_bh-test_bh.o `test -f 'test_bh.c' || echo '$(srcdir)/'`test_bh.c
test_bh-test_bh.obj: test_bh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_bh_CFLAGS) $(CFLAGS) -MT test_bh-test_bh.obj -MD -MP -MF $(DEPDIR)/test_bh-test_bh.Tpo -c -o test_bh-test_bh.obj `if test -f 'test_bh.c'; then $(CYGPATH_W) 'test_bh.c'; else $(CYGPATH_W) '$(srcdir)/test_bh.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/test_bh-test_bh.Tpo $(DEPDIR)/test_bh-test_bh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_bh.c' object='test_bh-test_bh.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_bh_CFLAGS) $(CFLAGS) -c -o test_bh-test_bh.obj `if test -f 'test_bh.c'; then $(CYGPATH_W) 'test_bh.c'; else $(CYGPATH_W) '$(srcdir)/test_bh.c'; fi`
test_qr-test_qr.o: test_qr.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_qr_CFLAGS) $(CFLAGS) -MT test_qr-test_qr.o -MD -MP -MF $(DEPDIR)/test_qr-test_qr.Tpo -c -o test_qr-test_qr.o `test -f 'test_qr.c' || echo '$(srcdir)/'`test_qr.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/test_qr-test_qr.Tpo $(DEPDIR)/test_qr-test_qr.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_qr.c' object='test_qr-test_qr.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_qr_CFLAGS) $(CFLAGS) -c -o test_qr-test_qr.o `test -f 'test_qr.c' || echo '$(srcdir)/'`test_qr.c
test_qr-test_qr.obj: test_qr.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_qr_CFLAGS) $(CFLAGS) -MT test_qr-test_qr.obj -MD -MP -MF $(DEPDIR)/test_qr-test_qr.Tpo -c -o test_qr-test_qr.obj `if test -f 'test_qr.c'; then $(CYGPATH_W) 'test_qr.c'; else $(CYGPATH_W) '$(srcdir)/test_qr.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/test_qr-test_qr.Tpo $(DEPDIR)/test_qr-test_qr.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_qr.c' object='test_qr-test_qr.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_qr_CFLAGS) $(CFLAGS) -c -o test_qr-test_qr.obj `if test -f 'test_qr.c'; then $(CYGPATH_W) 'test_qr.c'; else $(CYGPATH_W) '$(srcdir)/test_qr.c'; fi`
mostlyclean-libtool: mostlyclean-libtool:
-rm -f *.lo -rm -f *.lo
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
tpms = 1/2.6e6; tpms = 1/2.6e6;
ms_max = 30; ms_max = 30;
%% Plot the matrix-multiply tasks
%% Plot the task timelines for tasks allocation %% Plot the task timelines for tasks allocation
% Load the data % Load the data
tasks = importdata( 'test.dump' ); tasks = importdata( 'test.dump' );
...@@ -10,6 +12,7 @@ tasks(:,6) = ( tasks(:,6) - tasks(:,5) ) * tpms; ...@@ -10,6 +12,7 @@ tasks(:,6) = ( tasks(:,6) - tasks(:,5) ) * tpms;
start = min( tasks(:,5) ); start = min( tasks(:,5) );
tasks(:,5) = ( tasks(:,5) - start ) * tpms; tasks(:,5) = ( tasks(:,5) - start ) * tpms;
nr_cores = max( tasks(:,2) ) + 1; nr_cores = max( tasks(:,2) ) + 1;
maxd = max( [ tasks(:,3) ; tasks(:,4) ] );
% Init the plot % Init the plot
clf; clf;
...@@ -19,7 +22,7 @@ hold on; ...@@ -19,7 +22,7 @@ hold on;
% Plot the tasks % Plot the tasks
for k=1:size(tasks,1) for k=1:size(tasks,1)
rectangle( 'Position' , [ tasks(k,5) , tasks(k,2)+0.5 , tasks(k,6) , 1 ] , ... rectangle( 'Position' , [ tasks(k,5) , tasks(k,2)+0.5 , tasks(k,6) , 1 ] , ...
'EdgeColor' , [ 0 0.8 0 ] , 'LineWidth' , 1 , 'FaceColor' , [ tasks(k,3)/3 , tasks(k,4)/3 , 0 ] ); 'EdgeColor' , [ 0 0.8 0 ] , 'LineWidth' , 1 , 'FaceColor' , [ tasks(k,3)/maxd , tasks(k,4)/maxd , 0 ] );
text( tasks(k,5) + tasks(k,6)*0.5 , tasks(k,2)+1 , ... text( tasks(k,5) + tasks(k,6)*0.5 , tasks(k,2)+1 , ...
sprintf( '%i,%i' , tasks(k,3) , tasks(k,4) ) , ... sprintf( '%i,%i' , tasks(k,3) , tasks(k,4) ) , ...
'HorizontalAlignment' , 'Center' ); 'HorizontalAlignment' , 'Center' );
...@@ -36,8 +39,87 @@ axis([ 0 , max( tasks(:,5) + tasks(:,6) ) , 0.5 , nr_cores+0.5 ]); ...@@ -36,8 +39,87 @@ axis([ 0 , max( tasks(:,5) + tasks(:,6) ) , 0.5 , nr_cores+0.5 ]);
% Print this plot % Print this plot
set( gcf , 'PaperSize' , 2.3*[ 16 4 ] ); set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 16 4 ] ); set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 16 4 ] );
print -depsc2 tasks_dynamic.eps print -depsc2 tasks_mm_dynamic.eps
!epstopdf tasks_dynamic.eps !epstopdf tasks_mm_dynamic.eps
%% Plot the tiled QR tasks
%% Plot the task timelines for tasks allocation
% Load the data
tasks = importdata( 'test.dump' );
tasks(:,6) = ( tasks(:,6) - tasks(:,5) ) * tpms;
start = min( tasks(:,5) );
tasks(:,5) = ( tasks(:,5) - start ) * tpms;
nr_cores = max( tasks(:,2) ) + 1;
maxd = max( [ tasks(:,3) ; tasks(:,4) ] );
% Init the plot
clf;
subplot('position',[ 0.05 , 0.1 , 0.9 , 0.8 ]);
colours = [ 1 0 0 ; 1 1 0 ; 0 1 0 ; 0 0 1 ];
hold on;
% Plot the tasks
for k=1:size(tasks,1)
c = colours( tasks(k,1)+1 , : );
rectangle( 'Position' , [ tasks(k,5) , tasks(k,2)-0.5 , tasks(k,6) , 1 ] , ...
'EdgeColor' , 0.8*c , 'LineWidth' , 1 , 'FaceColor' , c );
end
% Set the axes and stuff.
hold off;
xlabel('time (ms)');
ylabel('core ID');
set(gca,'YTick',1:(max(tasks(:,1))+1))
title('tiled QR decomposition tasks');
axis([ 0 , max( tasks(:,5) + tasks(:,6) ) , -0.5 , nr_cores-0.5 ]);
% Print this plot
set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 16 4 ] );
print -depsc2 tasks_qr_dynamic.eps
!epstopdf tasks_qr_dynamic.eps
%% Plot the tiled Barnes-Hut tasks
%% Plot the task timelines for tasks allocation
% Load the data
tasks = importdata( 'test.dump' );
tasks(:,4) = ( tasks(:,4) - tasks(:,3) ) * tpms;
start = min( tasks(:,3) );
tasks(:,3) = ( tasks(:,3) - start ) * tpms;
nr_cores = max( tasks(:,2) ) + 1;
% Init the plot
clf;
subplot('position',[ 0.05 , 0.1 , 0.9 , 0.8 ]);
colours = [ 1 0 0 ; 0 1 0 ; 0 0 1 ; 1 1 0 ];
hold on;
% Plot the tasks
for k=1:size(tasks,1)
c = colours( tasks(k,1)+1 , : );
rectangle( 'Position' , [ tasks(k,3) , tasks(k,2)-0.5 , tasks(k,4) , 1 ] , ...
'EdgeColor' , 0.8*c , 'LineWidth' , 1 , 'FaceColor' , c );
end
% Set the axes and stuff.
hold off;
xlabel('time (ms)');
ylabel('core ID');
set(gca,'YTick',1:(max(tasks(:,1))+1))
title('Barnes-Hutt tasks');
axis([ 0 , max( tasks(:,3) + tasks(:,4) ) , -0.5 , nr_cores-0.5 ]);
% Print this plot
set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 16 4 ] );
print -depsc2 tasks_bh_dynamic.eps
!epstopdf tasks_bh_dynamic.eps
...@@ -22,15 +22,13 @@ ...@@ -22,15 +22,13 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <unistd.h>
#include <math.h> #include <math.h>
#include <omp.h> #include <omp.h>
/* Local includes. */ /* Local includes. */
#include "quicksched.h" #include "quicksched.h"
/* Error macro. */
#define error(s) { fprintf( stderr , "%s:%s():%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); }
/** /**
* @brief Matrix multiplication kernel. * @brief Matrix multiplication kernel.
...@@ -41,7 +39,7 @@ void matmul ( int m , int n , int k , double *a , int lda , double *b , int ldb ...@@ -41,7 +39,7 @@ void matmul ( int m , int n , int k , double *a , int lda , double *b , int ldb
int ii, jj, kk; int ii, jj, kk;
double acc; double acc;
// printf( "matmul: m=%i, n=%i, k=%i, lda=%i, ldb=%i, ldc=%i.\n" , // message( "matmul: m=%i, n=%i, k=%i, lda=%i, ldb=%i, ldc=%i." ,
// m , n , k , lda , ldb , ldc ); fflush(stdout); // m , n , k , lda , ldb , ldc ); fflush(stdout);
for ( ii = 0 ; ii < m ; ii++ ) for ( ii = 0 ; ii < m ; ii++ )
...@@ -58,7 +56,7 @@ void matmul ( int m , int n , int k , double *a , int lda , double *b , int ldb ...@@ -58,7 +56,7 @@ void matmul ( int m , int n , int k , double *a , int lda , double *b , int ldb
* @brief First test: Just tasks, no dependencies or conflicts. * @brief First test: Just tasks, no dependencies or conflicts.
* *
* Computes a tiled matrix multiplication of the form * Computes a tiled matrix multiplication of the form
* C_ij = A_i: * B_:j, with k taskx C_ij += A_ik*B_kj. * C_ij = A_i: * B_:j, with tasks C_ij += A_ik*B_kj.
*/ */
void test2 ( int m , int n , int k , int nr_threads ) { void test2 ( int m , int n , int k , int nr_threads ) {
...@@ -69,6 +67,10 @@ void test2 ( int m , int n , int k , int nr_threads ) { ...@@ -69,6 +67,10 @@ void test2 ( int m , int n , int k , int nr_threads ) {
double *a, *b, *c, *res, err = 0.0, irm = 1.0/RAND_MAX; double *a, *b, *c, *res, err = 0.0, irm = 1.0/RAND_MAX;
ticks tic_task, toc_task, tic_ref, toc_ref; ticks tic_task, toc_task, tic_ref, toc_ref;
/* Tell the user something about the test. */
message( "computing a tiled matrix multiplication of the form "
"C_ij = A_i: * B_:j, with tasks for each k where C_ij += A_ik*B_kj." );
/* Init the sched. */ /* Init the sched. */
bzero( &s , sizeof(struct sched) ); bzero( &s , sizeof(struct sched) );
sched_init( &s , nr_threads , m * n ); sched_init( &s , nr_threads , m * n );
...@@ -91,11 +93,11 @@ void test2 ( int m , int n , int k , int nr_threads ) { ...@@ -91,11 +93,11 @@ void test2 ( int m , int n , int k , int nr_threads ) {
/* Build a task for each tile of the matrix c. */ /* Build a task for each tile of the matrix c. */
for ( i = 0 ; i < m ; i++ ) for ( i = 0 ; i < m ; i++ )
for ( j = 0 ; j < n ; j++ ) { for ( j = 0 ; j < n ; j++ ) {
rid = sched_addres( &s ); rid = sched_addres( &s , -1 );
data[0] = i; data[1] = j; data[0] = i; data[1] = j;
for ( kk = 0 ; kk < k ; kk++ ) { for ( kk = 0 ; kk < k ; kk++ ) {
data[2] = kk; data[2] = kk;
tid = sched_newtask( &s , 1 , 0 , 0 , data , 3*sizeof(int) ); tid = sched_newtask( &s , 1 , 0 , 0 , data , 3*sizeof(int) , 1 );
sched_addlock( &s , tid , rid ); sched_addlock( &s , tid , rid );
} }
} }
...@@ -122,7 +124,7 @@ void test2 ( int m , int n , int k , int nr_threads ) { ...@@ -122,7 +124,7 @@ void test2 ( int m , int n , int k , int nr_threads ) {
switch ( t->type ) { switch ( t->type ) {
case 1: case 1:
d = sched_getdata( &s , t ); d = sched_getdata( &s , t );
// printf( "test2[%02i]: working on block [ %i , %i ] with k=%i, lock[0]=%i.\n" , qid , d[0] , d[1] , d[2] , t->locks[0] ); fflush(stdout); // message( "thread %i working on block [ %i , %i ] with k=%i, lock[0]=%i." , qid , d[0] , d[1] , d[2] , t->locks[0] ); fflush(stdout);
matmul( 32 , 32 , 32 , &a[ d[2]*32*m*32 + d[0]*32 ] , m*32 , &b[ k*32*d[1]*32 + d[2]*32 ] , k*32 , &c[ d[0]*32 + m*32*d[1]*32 ] , m*32 ); matmul( 32 , 32 , 32 , &a[ d[2]*32*m*32 + d[0]*32 ] , m*32 , &b[ k*32*d[1]*32 + d[2]*32 ] , k*32 , &c[ d[0]*32 + m*32*d[1]*32 ] , m*32 );
break; break;
default: default:
...@@ -145,9 +147,9 @@ void test2 ( int m , int n , int k , int nr_threads ) { ...@@ -145,9 +147,9 @@ void test2 ( int m , int n , int k , int nr_threads ) {
toc_ref = getticks(); toc_ref = getticks();
for ( i = 0 ; i < m * n * 32 * 32 ; i++ ) for ( i = 0 ; i < m * n * 32 * 32 ; i++ )
err += ( res[i] - c[i] ) * ( res[i] - c[i] ); err += ( res[i] - c[i] ) * ( res[i] - c[i] );
printf( "test2: Frob. norm of error is %.3e.\n" , sqrt( err ) ); message( "Frob. norm of error is %.3e." , sqrt( err ) );
printf( "test2: tasks took %lli ticks.\n" , toc_task - tic_task ); message( "tasks took %lli ticks." , toc_task - tic_task );
printf( "test2: ref. took %lli ticks.\n" , toc_ref - tic_ref ); message( "ref. took %lli ticks." , toc_ref - tic_ref );
/* Dump the tasks. */ /* Dump the tasks. */
/* for ( k = 0 ; k < s.count ; k++ ) { /* for ( k = 0 ; k < s.count ; k++ ) {
...@@ -180,6 +182,10 @@ void test1 ( int m , int n , int k , int nr_threads ) { ...@@ -180,6 +182,10 @@ void test1 ( int m , int n , int k , int nr_threads ) {
double *a, *b, *c, *res, err = 0.0, irm = 1.0/RAND_MAX; double *a, *b, *c, *res, err = 0.0, irm = 1.0/RAND_MAX;
ticks tic_task, toc_task, tic_ref, toc_ref; ticks tic_task, toc_task, tic_ref, toc_ref;
/* Tell the user something about the test. */
message( "computing a tiled matrix multiplication of the form "
"C_ij = A_i: * B_:j, with a single task per C_ij." );
/* Init the sched. */ /* Init the sched. */
bzero( &s , sizeof(struct sched) ); bzero( &s , sizeof(struct sched) );
sched_init( &s , nr_threads , m * n ); sched_init( &s , nr_threads , m * n );
...@@ -203,8 +209,8 @@ void test1 ( int m , int n , int k , int nr_threads ) { ...@@ -203,8 +209,8 @@ void test1 ( int m , int n , int k , int nr_threads ) {
for ( i = 0 ; i < m ; i++ ) for ( i = 0 ; i < m ; i++ )
for ( j = 0 ; j < n ; j++ ) { for ( j = 0 ; j < n ; j++ ) {
data[0] = i; data[1] = j; data[0] = i; data[1] = j;
rid = sched_addres( &s ); rid = sched_addres( &s , -1 );
tid = sched_newtask( &s , 1 , 0 , 0 , data , 2*sizeof(int) ); tid = sched_newtask( &s , 1 , 0 , 0 , data , 2*sizeof(int) , 1 );
sched_addlock( &s , tid , rid ); sched_addlock( &s , tid , rid );
} }
...@@ -230,7 +236,7 @@ void test1 ( int m , int n , int k , int nr_threads ) { ...@@ -230,7 +236,7 @@ void test1 ( int m , int n , int k , int nr_threads ) {
switch ( t->type ) { switch ( t->type ) {
case 1: case 1:
d = sched_getdata( &s , t ); d = sched_getdata( &s , t );
// printf( "test1[%02i]: working on block [ %i , %i ].\n" , qid , d[0] , d[1] ); fflush(stdout); // message( "thread %i working on block [ %i , %i ]." , qid , d[0] , d[1] ); fflush(stdout);
matmul( 32 , 32 , k*32 , &a[ d[0]*32 ] , m*32 , &b[ k*32*d[1]*32 ] , k*32 , &c[ d[0]*32 + m*32*d[1]*32 ] , m*32 ); matmul( 32 , 32 , k*32 , &a[ d[0]*32 ] , m*32 , &b[ k*32*d[1]*32 ] , k*32 , &c[ d[0]*32 + m*32*d[1]*32 ] , m*32 );
break; break;
default: default:
...@@ -253,9 +259,9 @@ void test1 ( int m , int n , int k , int nr_threads ) { ...@@ -253,9 +259,9 @@ void test1 ( int m , int n , int k , int nr_threads ) {
toc_ref = getticks(); toc_ref = getticks();
for ( i = 0 ; i < m * n * 32 * 32 ; i++ ) for ( i = 0 ; i < m * n * 32 * 32 ; i++ )
err += ( res[i] - c[i] ) * ( res[i] - c[i] ); err += ( res[i] - c[i] ) * ( res[i] - c[i] );
printf( "test1: Frob. norm of error is %.3e.\n" , sqrt( err ) ); message( "Frob. norm of error is %.3e." , sqrt( err ) );
printf( "test1: tasks took %lli ticks.\n" , toc_task - tic_task ); message( "tasks took %lli ticks." , toc_task - tic_task );
printf( "test1: ref. took %lli ticks.\n" , toc_ref - tic_ref ); message( "ref. took %lli ticks." , toc_ref - tic_ref );
/* Dump the tasks. */ /* Dump the tasks. */
/* for ( k = 0 ; k < s.count ; k++ ) { /* for ( k = 0 ; k < s.count ; k++ ) {
...@@ -279,16 +285,47 @@ void test1 ( int m , int n , int k , int nr_threads ) { ...@@ -279,16 +285,47 @@ void test1 ( int m , int n , int k , int nr_threads ) {
int main ( int argc , char *argv[] ) { int main ( int argc , char *argv[] ) {
int nr_threads; int c, nr_threads;
int M = 4, N = 4, K = 4; int M = 4, N = 4, K = 4;
/* Get the number of threads. */ /* Get the number of threads. */
#pragma omp parallel shared(nr_threads) #pragma omp parallel shared(nr_threads)
{ {
#pragma omp single if ( omp_get_thread_num() == 0 )
nr_threads = omp_get_num_threads(); nr_threads = omp_get_num_threads();
} }
/* Parse the options */
while ( ( c = getopt( argc , argv , "m:n:k:t:" ) ) != -1 )
switch( c ) {
case 'k':
if ( sscanf( optarg , "%d" , &K ) != 1 )
error( "Error parsing dimension M." );
break;
case 'm':
if ( sscanf( optarg , "%d" , &M ) != 1 )
error( "Error parsing dimension M." );
break;
case 'n':
if ( sscanf( optarg , "%d" , &N ) != 1 )
error( "Error parsing dimension M." );
break;
case 't':
if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
error( "Error parsing number of threads." );
omp_set_num_threads( nr_threads );
break;
case '?':
fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
fprintf( stderr , "Computes tests with nr_threads threads for the multiplication\n"
"of a matrix of size MxK and of size KxN tiles of size 32x32.\n" );
exit( EXIT_FAILURE );
}
/* Dump arguments. */
message( "multiplying two matrices of size %ix%i and %ix%i using %i threads." ,
32*M , 32*K , 32*K , 32*N , nr_threads );
/* Call the first test. */ /* Call the first test. */
test1( M , N , K , nr_threads ); test1( M , N , K , nr_threads );
......
This diff is collapsed.
/*******************************************************************************
* This file is part of QuickSched.
* Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>
/* Local includes. */
#include "quicksched.h"
/* Prototypes for BLAS function. */
void dtrmm_ ( char * , char * , char * , char * , int * , int * , double * , double * , int * , double * , int * );
/*
* Sam's routines for the tiled QR decomposition.
*/
/*
\brief Computes 2-norm of a vector \f$x\f$
Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
*/
double do2norm(double* x, int l)
{
double sum = 0, norm;
int i;
for(i = 0; i < l; i++)
sum += x[i] * x[i];
norm = sqrt(sum);
return norm;
}
/**
 * \brief Builds a Householder vector for one column spanning two stacked blocks.
 *
 * The column is assembled in vk as: topDiag, then zeros up to index ma-1,
 * then the entries of xb. Its 2-norm (with the sign of topDiag) is added to
 * vk[0], and, unless the norm is exactly zero, entries 1..l-1 are divided by
 * the resulting vk[0]. vk[0] itself is left unscaled, i.e. it holds the
 * normalisation factor rather than 1.
 *
 * \param topDiag The only non-zero entry of the column in the top block.
 * \param ma      Number of entries of the column that belong to the top block.
 * \param xb      Pointer to the part of the column in the lower block
 *                (l - ma entries are read).
 * \param l       Total number of entries of the assembled column.
 * \param vk      Pre-allocated output array of size l.
 *
 * \returns void
 */
void calcvkDouble (double topDiag,
                   int ma,
                   double* xb,
                   int l,
                   double* vk)
{
    int pos;
    double colNorm, scale;
    const double sgn = (topDiag >= 0.0) ? 1.0 : -1.0;

    /* Assemble the full column: leading entry, zero padding, lower part. */
    vk[0] = topDiag;
    pos = 1;
    while (pos < ma)
    {
        vk[pos] = 0;
        pos++;
    }
    while (pos < l)
    {
        vk[pos] = xb[pos - ma];
        pos++;
    }

    /* vk[0] doubles as the normalisation factor (no temporary needed). */
    colNorm = do2norm(vk, l);
    vk[0] += colNorm * sgn;

    /* A zero column has nothing to normalise. */
    if (colNorm == 0.0)
        return;

    scale = 1/vk[0];
    for (pos = 1; pos < l; pos++)
        vk[pos] *= scale;
}
/**
 * \brief Computes the essential part of a Householder reflector for a single block.
 *
 * Conceptually forms \f[v = \textrm{sign}(x_1)\|x\|_2 e_1 + x\f] and then
 * normalises it so that its first entry is 1: \f[v = \frac{v}{v_1}\f]
 * Only entries 2..l of the normalised vector are stored; the leading 1 is
 * implied. If the norm of x is exactly zero the copied entries are left
 * unscaled.
 *
 * \param x  Pointer to the column vector (l entries are read).
 * \param l  Number of entries in \f$x\f$.
 * \param vk Pre-allocated output array of size l - 1.
 *
 * \returns void
 */
void calcvkSingle (double* x,
                   int l,
                   double* vk)
{
    int pos;
    double colNorm, pivot;
    const double sgn = (x[0] >= 0.0) ? 1.0 : -1.0;

    /* Everything below the leading entry goes to the output, shifted up by one. */
    for (pos = 0; pos < l - 1; pos++)
        vk[pos] = x[pos + 1];

    /* Euclidean norm of the original column. */
    colNorm = do2norm(x, l);

    /* First entry of the unnormalised reflector. */
    pivot = x[0];
    pivot += colNorm * sgn;

    /* Nothing to scale for a zero column. */
    if (colNorm == 0.0)
        return;

    pivot = 1/pivot;
    for (pos = 0; pos < l - 1; pos++)
        vk[pos] *= pivot;
}
/**
 * \brief Applies one Householder reflector to a pair of stacked blocks.
 *
 * The reflector has an implied 1 at row k of the top block and the entries
 * of hhVector in the lower block. Columns k..n-1 of both blocks are updated
 * with b_j = b_j - tau * v * (v' * b_j), where tau = 2 / (v'v). Afterwards
 * hhVector is stored in column k of blockB and tau in blockTau[k].
 *
 * \param blockA   Top block (only row k of each column is touched).
 * \param blockB   Lower block (mb rows per column are touched).
 * \param blockTau Array receiving tau at index k.
 * \param k        Index of the reflector / first column to update.
 * \param ma       Unused by this routine.
 * \param mb       Number of rows in the lower block.
 * \param n        Number of columns.
 * \param ldm      Leading dimension (column stride) of both blocks.
 * \param hhVector Lower (essential) part of the reflector, mb entries.
 */
void updateDoubleQ_WY (double* blockA,
                       double* blockB,
                       double* blockTau,
                       int k, int ma, int mb, int n,
                       int ldm,
                       double* hhVector)
{
    int row, col;
    double tau = 1.0, dot;
    double *lower;

    /* tau = 2/v'v, the leading 1.0 accounting for the implied unit entry. */
    for (row = 0; row < mb; row++)
        tau += hhVector[row] * hhVector[row];
    tau = 2/tau;

    for (col = k; col < n; col++)
    {
        lower = &blockB[col*ldm];

        /* dot = tau * v' * b_j: unit entry in the top block first, then the lower rows. */
        dot = blockA[(col*ldm) + k];
        for (row = 0; row < mb; row++)
            dot += lower[row] * hhVector[row];
        dot *= tau;

        /* b_j = b_j - dot * v */
        blockA[(col*ldm) + k] -= dot;
        for (row = 0; row < mb; row++)
            lower[row] -= dot * hhVector[row];
    }

    /* Keep the reflector in column k of the lower block. */
    lower = &blockB[k*ldm];
    for (row = 0; row < mb; row++)
        lower[row] = hhVector[row];

    blockTau[k] = tau;
}
/**
 * \brief Records the scalar factor of the k-th reflector.
 *
 * Only tauBlock, beta and k are used; the remaining parameters are accepted
 * but ignored by this routine.
 *
 * \param blockV   Unused.
 * \param tauBlock Array receiving beta at index k.
 * \param beta     Value to store.
 * \param k        Index of the reflector.
 * \param m        Unused.
 * \param n        Unused.
 * \param ldm      Unused.
 * \param w        Unused.
 */
void updatekthSingleWY (double* blockV,
                        double* tauBlock,
                        double beta,
                        int k,
                        int m, int n, int ldm,
                        double* w)
{
    double *slot = tauBlock + k;

    /* Store the factor in the k-th entry of the tau array. */
    *slot = beta;
}
/**
 * \brief Applies the k-th Householder reflector to a single block.
 *
 * Computes A = A - (2/v'v) * v * (v'A) for columns k..n-1 of the block,
 * where v has an implied 1 at row k and the entries of workVector in rows
 * k+1..m-1. The essential part of v is then stored below the diagonal in
 * column k, and 2/v'v is handed to updatekthSingleWY for storage at index k
 * of tauBlock.
 *
 * The column loop runs up to the n parameter, so blocks narrower than 32
 * columns are not overrun.
 *
 * \param block      The block to update, column-major with stride ldm.
 * \param tauBlock   Array receiving 2/v'v at index k.
 * \param k          Index of the reflector / first row and column to update.
 * \param m          Number of rows in the block.
 * \param n          Number of columns in the block.
 * \param ldm        Leading dimension (column stride) of the block.
 * \param workVector Essential part of the reflector, m - k - 1 entries.
 */
void updateSingleQ_WY (double* block,
                       double* tauBlock,
                       int k,
                       int m, int n, int ldm,//dims of block
                       double* workVector)
{
    /* Compute A = A - 2/v'v*vv'A */
    int i, j;
    double beta = 1.0, prod;

    /* Accumulate v'v; the initial 1.0 is the implied unit entry of v. */
    for(i = k + 1; i < m; i ++)
    {
        beta += workVector[i - k - 1] * workVector[i - k - 1];
    }

    /* Finish computation of -2/v'v (negated so the update below is an add). */
    beta = (-2)/beta;

    /* Update every column from k up to the block width n. */
    for(j = k; j < n; j ++)
    {
        /* Compute prod = v'A_j */
        prod = block[(j*ldm) + k];//(k,k) to (k,n)

        for(i = k + 1; i < m; i ++)
            prod += block[(j*ldm) + i] * workVector[i - k - 1];

        /* Compute A_j = A_j - beta*v*prod */
        block[(j*ldm) + k] += beta * prod;

        for(i = k + 1; i < m; i ++)
            block[(j*ldm) + i] += beta * prod * workVector[i - k - 1];
    }

    /* Insert nonessential vector below diagonal. */
    for(i = k + 1; i < m; i ++)
        block[(k*ldm) + i] = workVector[i - k - 1];

    /* Pass the positive factor 2/v'v on for storage. */
    updatekthSingleWY (block,
                       tauBlock,
                       -beta,
                       k, m, n, ldm,
                       workVector);
}
/**
 * \brief Runs the Householder sweep over a pair of stacked blocks.
 *
 * For each column k = 0..n-1 a reflector is built with calcvkDouble from the
 * diagonal entry of blockA and column k of blockB, then applied to both
 * blocks with updateDoubleQ_WY, which also stores the reflector in blockB
 * and its factor in blockTau.
 *
 * \param blockA   Top block, column-major with stride ldm.
 * \param blockB   Lower block, column-major with stride ldm.
 * \param blockTau Array receiving one factor per column.
 * \param ma       Number of rows in the top block.
 * \param mb       Number of rows in the lower block.
 * \param n        Number of columns.
 * \param ldm      Leading dimension (column stride) of both blocks.
 * \param hhVector Work array; calcvkDouble writes (ma + mb) - k entries to it.
 */
void DTSQRF (double* blockA,
             double* blockB,
             double* blockTau,
             int ma,
             int mb,
             int n,
             int ldm,
             double* hhVector)
{
    int col;

    for (col = 0; col < n; col++)
    {
        /* Diagonal entry (col,col) of the top block and top of column col below. */
        double* diagA = blockA + col*(ldm + 1);
        double* colB = blockB + col*ldm;

        /* vk = sign(x[1])||x||_2e1 + x, then scaled by 1/vk[0]. */
        calcvkDouble(diagA[0], ma - col, colB, (ma + mb) - col, hhVector);

        /* Update both blocks; only the lower part of the reflector (past the
           first ma - col entries of the work array) is passed on, so vectors
           already stored below the diagonal of the top block are treated as
           zeros. */
        updateDoubleQ_WY (blockA, blockB,
                          blockTau,
                          col, ma, mb, n,
                          ldm,
                          hhVector + ma - col);
    }
}
/**
 * \brief Applies stored stacked-block reflectors to another pair of blocks.
 *
 * For every column j of blockA/blockB and every reflector k (in that order)
 * computes b_j = b_j - tau_k * v_k * (v_k' * b_j). Reflector k has an
 * implied 1 at row k of the top block and its lower part in column k of
 * blockV; tau_k is read from blockTau[k].
 *
 * \param blockV   Block holding the lower parts of the reflectors.
 * \param blockA   Top block to update (one entry per column and reflector).
 * \param blockB   Lower block to update (b rows per column).
 * \param blockTau Array of reflector factors.
 * \param b        Number of rows in the lower block.
 * \param n        Number of columns, and number of reflectors.
 * \param ldm      Leading dimension (column stride) of all blocks.
 */
void DSSRFT (double* blockV,
             double* blockA, double* blockB,
             double* blockTau,
             int b, int n, int ldm)
{
    int row, col, refl;

    /* Walk the columns of the target blocks. */
    for (col = 0; col < n; col++)
    {
        double* topCol = blockA + col*ldm;
        double* lowCol = blockB + col*ldm;

        /* Apply the reflectors one after another to this column. */
        for (refl = 0; refl < n; refl++)
        {
            const double* v = blockV + refl*ldm;
            const double tau = blockTau[refl];
            double proj;

            /* proj = tau * v_k' * b_j; the top half of v_k is a single 1 at row refl. */
            proj = topCol[refl];
            for (row = 0; row < b; row++)
                proj += lowCol[row] * v[row];
            proj *= tau;

            /* b_j = b_j - proj * v_k, top entry first, then the lower block. */
            topCol[refl] -= proj;
            for (row = 0; row < b; row++)
                lowCol[row] -= proj * v[row];
        }
    }
}
/**
 * \brief Runs the Householder sweep over a single block.
 *
 * For each column k = 0..n-1 the reflector of the sub-column starting at the
 * diagonal entry (k,k) is computed with calcvkSingle and applied with
 * updateSingleQ_WY, which also stores the reflector below the diagonal and
 * its factor in tauBlock.
 *
 * \param block      The block to factorise, column-major with stride ldm.
 * \param tauBlock   Array receiving one factor per column.
 * \param m          Number of rows in the block.
 * \param n          Number of columns in the block.
 * \param ldm        Leading dimension (column stride) of the block.
 * \param workVector Work array; calcvkSingle writes m - k - 1 entries to it.
 */
void DGEQRF (double* block,
             double* tauBlock,
             int m, int n, int ldm,
             double* workVector)
{
    int col;

    for (col = 0; col < n; col++)
    {
        /* Diagonal entry (col,col): one column along and one row down per step. */
        double* diag = block + col*(ldm + 1);

        /* Essential part of the col-th reflector goes into workVector. */
        calcvkSingle(diag, m - col, workVector);

        /* Apply it (implied 1 in the first entry) to the block. */
        updateSingleQ_WY (block, tauBlock,
                          col, m, n, ldm,
                          workVector);
    }
}
/**
 * \brief Applies the reflectors stored in blockV to another block.
 *
 * For every column j of block and every reflector k (in that order) computes
 * b_j = b_j - tau_k * v_k * (v_k' * b_j). Reflector k has an implied 1 at
 * row k and its remaining entries in rows k+1..m-1 of column k of blockV;
 * tau_k is read from tauBlock[k].
 *
 * \param block    The block to update, column-major with stride ldm.
 * \param blockV   Block holding the reflectors below its diagonal.
 * \param tauBlock Array of reflector factors.
 * \param m        Number of rows.
 * \param n        Number of columns, and number of reflectors.
 * \param ldm      Leading dimension (column stride) of both blocks.
 */
void DLARFT (double* block,
             double* blockV,
             double* tauBlock,
             int m, int n, int ldm)
{
    int row, col, refl;

    /* Walk the columns of the target block. */
    for (col = 0; col < n; col++)
    {
        double* target = block + col*ldm;

        /* Apply the reflectors one after another to this column. */
        for (refl = 0; refl < n; refl++)
        {
            const double* v = blockV + refl*ldm;
            const double tau = tauBlock[refl];
            double proj;

            /* proj = tau * v_k' * b_j, starting with the implied unit entry at row refl. */
            proj = target[refl];
            for (row = refl + 1; row < m; row++)
                proj += v[row] * target[row];
            proj *= tau;

            /* b_j = b_j - proj * v_k, unit entry first, then the rows below. */
            target[refl] -= proj;
            for (row = refl + 1; row < m; row++)
                target[row] -= proj * v[row];
        }
    }
}
/**
 * @brief Compute a tiled QR factorization using QuickSched.
 *
 * The matrix is made of m x n tiles of 32 x 32 doubles stored as one
 * column-major array with leading dimension 32*m, so tile (i,j) starts at
 * A[ j*m*32*32 + i*32 ]. One task is created per tile and per elimination
 * level k; the tasks are then executed by an OpenMP team and a one-line
 * record per task is printed to stdout.
 *
 * @param m Number of tile rows.
 * @param n Number of tile columns.
 * @param nr_threads Number of threads to use.
 */
void test_qr ( int m , int n , int nr_threads ) {

    int k, j, i;
    double *A, *A_orig, *tau;
    struct sched s;
    int *tid, *rid, tid_new;
    int data[3];
    enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT };

    /* Allocate and fill the original matrix. */
    if ( ( A = (double *)malloc( sizeof(double) * m * n * 32 * 32 ) ) == NULL ||
         ( tau = (double *)malloc( sizeof(double) * m * n * 32 ) ) == NULL ||
         ( A_orig = (double *)malloc( sizeof(double) * m * n * 32 * 32 ) ) == NULL )
        error( "Failed to allocate matrices." );
    for ( k = 0 ; k < m * n * 32 * 32 ; k++ )
        A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
    memcpy( A , A_orig , sizeof(double) * m * n * 32 * 32 );
    bzero( tau , sizeof(double) * m * n * 32 );

    /* Dump A_orig. */
    /* message( "A_orig = [" );
    for ( k = 0 ; k < m*32 ; k++ ) {
        for ( j = 0 ; j < n*32 ; j++ )
            printf( "%.3f " , A_orig[ j*m*32 + k ] );
        printf( "\n" );
        }
    printf( "];\n" ); */

    /* Initialize the scheduler. */
    sched_init( &s , nr_threads , m*n );

    /* Allocate and init the task ID and resource ID matrix.
       tid[ j*m + i ] holds the last task created for tile (i,j), or -1;
       rid[ j*m + i ] holds the resource handle of tile (i,j). */
    if ( ( tid = (int *)malloc( sizeof(int) * m * n ) ) == NULL ||
         ( rid = (int *)malloc( sizeof(int) * m * n ) ) == NULL )
        error( "Failed to allocate tid/rid matrix." );
    for ( k = 0 ; k < m * n ; k++ ) {
        tid[k] = -1;
        rid[k] = sched_addres( &s , -1 );
        }

    /* Build the tasks. Each task carries (tile row, tile column, level). */
    for ( k = 0 ; k < m && k < n ; k++ ) {

        /* Add kth corner task. */
        data[0] = k; data[1] = k; data[2] = k;
        tid_new = sched_newtask( &s , task_DGEQRF , 0 , 0 , data , sizeof(int)*3 , 2 );
        sched_addlock( &s , tid_new , rid[ k*m + k ] );
        if ( tid[ k*m + k ] != -1 )
            sched_addunlock( &s , tid[ k*m + k ] , tid_new );
        tid[ k*m + k ] = tid_new;

        /* Add column tasks on kth row. */
        for ( j = k+1 ; j < n ; j++ ) {
            data[0] = k; data[1] = j; data[2] = k;
            tid_new = sched_newtask( &s , task_DLARFT , 0 , 0 , data , sizeof(int)*3 , 3 );
            sched_addlock( &s , tid_new , rid[ j*m + k ] );
            sched_adduse( &s , tid_new , rid[ k*m + k ] );
            sched_addunlock( &s , tid[ k*m + k ] , tid_new );
            if ( tid[ j*m + k ] != -1 )
                sched_addunlock( &s , tid[ j*m + k ] , tid_new );
            tid[ j*m + k ] = tid_new;
            }

        /* For each following row... */
        for ( i = k+1 ; i < m ; i++ ) {

            /* Add the row task for the kth column. */
            data[0] = i; data[1] = k; data[2] = k;
            tid_new = sched_newtask( &s , task_DTSQRF , 0 , 0 , data , sizeof(int)*3 , 3 );
            sched_addlock( &s , tid_new , rid[ k*m + i ] );
            sched_adduse( &s , tid_new , rid[ k*m + k ] );
            sched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
            if ( tid[ k*m + i ] != -1 )
                sched_addunlock( &s , tid[ k*m + i ] , tid_new );
            tid[ k*m + i ] = tid_new;

            /* Add the inner tasks. */
            for ( j = k+1 ; j < n ; j++ ) {
                data[0] = i; data[1] = j; data[2] = k;
                tid_new = sched_newtask( &s , task_DSSRFT , 0 , 0 , data , sizeof(int)*3 , 5 );
                sched_addlock( &s , tid_new , rid[ j*m + i ] );
                sched_adduse( &s , tid_new , rid[ k*m + i ] );
                sched_adduse( &s , tid_new , rid[ j*m + k ] );
                sched_addunlock( &s , tid[ k*m + i ] , tid_new );
                /* The DSSRFT kernel also writes the top tile (k,j), which is
                   shared by all rows i of this column. Chain on the task of
                   the row directly above (the DLARFT task when i == k+1,
                   otherwise the previous DSSRFT) so that these updates run
                   one after the other and in row order. */
                sched_addunlock( &s , tid[ j*m + (i-1) ] , tid_new );
                if ( tid[ j*m + i ] != -1 )
                    sched_addunlock( &s , tid[ j*m + i ] , tid_new );
                tid[ j*m + i ] = tid_new;
                }

            }

        } /* build the tasks. */

    /* Prepare the scheduler. */
    sched_prepare( &s );

    /* Parallel loop. */
    #pragma omp parallel
    {

        int *d, qid;
        /* Task coordinates must be private to each thread: the function-scope
           i, j, k are shared inside the parallel region. */
        int ti, tj, tk;
        double buff[ 2*32*32 ];
        struct task *t;

        /* Get the ID of this runner. */
        if ( ( qid = omp_get_thread_num() ) < nr_threads ) {

            /* Main loop. */
            while ( 1 ) {

                /* Get a task, break if unsuccessful. */
                if ( ( t = sched_gettask( &s , qid ) ) == NULL )
                    break;

                /* Get the task's data. */
                d = sched_getdata( &s , t );
                ti = d[0]; tj = d[1]; tk = d[2];

                /* Decode and execute the task. */
                switch ( t->type ) {
                    case task_DGEQRF:
                        DGEQRF( &A[ tj*m*32*32 + ti*32 ] , &tau[ tj*m*32 + ti*32 ] , 32 , 32 , 32*m , buff );
                        break;
                    case task_DLARFT:
                        DLARFT( &A[ tj*m*32*32 + ti*32 ] , &A[ ti*m*32*32 + ti*32 ] , &tau[ ti*m*32 + ti*32 ] , 32 , 32 , 32*m );
                        break;
                    case task_DTSQRF:
                        DTSQRF( &A[ tj*m*32*32 + tj*32 ] , &A[ tj*m*32*32 + ti*32 ] , &tau[ tj*m*32 + ti*32 ] , 32 , 32 , 32 , 32*m , buff );
                        break;
                    case task_DSSRFT:
                        /* Reflectors live in tile (i,k), i.e. at
                           A[ k*m*32*32 + i*32 ], like every other tile. */
                        DSSRFT( &A[ tk*m*32*32 + ti*32 ] , &A[ tj*m*32*32 + tk*32 ] , &A[ tj*m*32*32 + ti*32 ] , &tau[ tk*m*32 + ti*32 ] , 32 , 32 , 32*m );
                        break;
                    default:
                        error( "Unknown task type." );
                    }

                /* Clean up afterwards. */
                sched_done( &s , t );

                } /* main loop. */

            } /* valid thread. */

    } /* parallel loop. */

    /* Dump A. */
    /* message( "A = [" );
    for ( k = 0 ; k < m*32 ; k++ ) {
        for ( j = 0 ; j < n*32 ; j++ )
            printf( "%.3f " , A[ j*m*32 + k ] );
        printf( "\n" );
        }
    printf( "];\n" ); */

    /* Dump tau. */
    /* message( "tau = [" );
    for ( k = 0 ; k < m*32 ; k++ ) {
        for ( j = 0 ; j < n ; j++ )
            printf( "%.3f " , tau[ j*m*32 + k ] );
        printf( "\n" );
        }
    printf( "];\n" ); */

    /* Dump the tasks: type, queue ID, tile row, tile column, tic, toc. */
    for ( k = 0 ; k < s.count ; k++ ) {
        int *d = (int *)&s.data[ s.tasks[k].data ];
        printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
        }

    /* Release everything allocated by this function. */
    /* NOTE(review): the scheduler's own storage is not released here; if the
       library offers a teardown routine it should be called as well. */
    free( rid );
    free( tid );
    free( A_orig );
    free( tau );
    free( A );

    }
/**
 * @brief Main function.
 *
 * Reads the tile counts and the thread count from the command line and runs
 * test_qr() on them.
 *
 * Options:
 *   -m M           number of tile rows (default 4)
 *   -n N           number of tile columns (default 4)
 *   -t nr_threads  number of threads (default: size of the OpenMP team)
 *
 * @param argc Number of command-line arguments.
 * @param argv Command-line arguments.
 * @return 0 on success; exits with EXIT_FAILURE on an unrecognized option.
 */
int main ( int argc , char *argv[] ) {

    int c, nr_threads;
    int M = 4, N = 4;

    /* Get the number of threads. */
    #pragma omp parallel shared(nr_threads)
    {
        if ( omp_get_thread_num() == 0 )
            nr_threads = omp_get_num_threads();
    }

    /* Parse the options */
    /* NOTE(review): "k:" is accepted by the option string but has no case
       below, so a -k argument is silently ignored. */
    while ( ( c = getopt( argc , argv , "m:n:k:t:" ) ) != -1 )
        switch( c ) {
            case 'm':
                if ( sscanf( optarg , "%d" , &M ) != 1 )
                    error( "Error parsing dimension M." );
                break;
            case 'n':
                if ( sscanf( optarg , "%d" , &N ) != 1 )
                    error( "Error parsing dimension N." );
                break;
            case 't':
                if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
                    error( "Error parsing number of threads." );
                omp_set_num_threads( nr_threads );
                break;
            case '?':
                fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N]\n" , argv[0] );
                fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
                                  "matrix using nr_threads threads.\n" );
                exit( EXIT_FAILURE );
            }

    /* Dump arguments. */
    message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads." ,
        32*M , 32*N , nr_threads );

    test_qr( M , N , nr_threads );

    return 0;

    }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment