Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Q
QuickSched
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
SWIFT
QuickSched
Commits
898204b0
Commit
898204b0
authored
11 years ago
by
Pedro Gonnet
Browse files
Options
Downloads
Patches
Plain Diff
add ompss version of the qr test.
parent
8fd2e454
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
examples/test_qr_ompss.c
+394
-0
394 additions, 0 deletions
examples/test_qr_ompss.c
with
394 additions
and
0 deletions
examples/test_qr_ompss.c
0 → 100644
+
394
−
0
View file @
898204b0
/*******************************************************************************
* This file is part of QuickSched.
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
/* Config parameters. */
#include
"../config.h"
/* Standard includes. */
#include
<stdio.h>
#include
<stdlib.h>
#include
<string.h>
#include
<unistd.h>
#include
<math.h>
#include
<omp.h>
/* LAPACKE header. */
#include
<lapacke.h>
#include
<cblas.h>
/* Local includes. */
#include
"quicksched.h"
/*
* Sam's routines for the tiled QR decomposition.
*/
/*
\brief Computes 2-norm of a vector \f$x\f$
Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
*/
double
do2norm
(
double
*
x
,
int
l
)
{
double
sum
=
0
,
norm
;
int
i
;
for
(
i
=
0
;
i
<
l
;
i
++
)
sum
+=
x
[
i
]
*
x
[
i
];
norm
=
sqrt
(
sum
);
return
norm
;
}
/**
* \brief Computes a Householder reflector from a pair of vectors from coupled blocks
*
* Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks.
* There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag
*
* \param topDiag The only non-zero element of the incoming vector in the top block
* \param ma The number of elements in the top vector
* \param xb Pointer to the lower vector
* \param l The number of elements in the whole vector
* \param vk A pointer to a pre-allocated array to store the householder vector of size l
*
* \returns void
*/
void
calcvkDouble
(
double
topDiag
,
int
ma
,
double
*
xb
,
int
l
,
double
*
vk
)
{
int
sign
,
i
;
double
norm
,
div
;
//same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable
sign
=
topDiag
>=
0
.
0
?
1
:
-
1
;
vk
[
0
]
=
topDiag
;
//use vk[0] as beta
for
(
i
=
1
;
i
<
ma
;
i
++
)
vk
[
i
]
=
0
;
for
(;
i
<
l
;
i
++
)
vk
[
i
]
=
xb
[
i
-
ma
];
norm
=
do2norm
(
vk
,
l
);
vk
[
0
]
+=
norm
*
sign
;
if
(
norm
!=
0
.
0
)
{
div
=
1
/
vk
[
0
];
for
(
i
=
1
;
i
<
l
;
i
++
)
vk
[
i
]
*=
div
;
}
}
void
updateDoubleQ_WY
(
double
*
blockA
,
double
*
blockB
,
double
*
blockTau
,
int
k
,
int
ma
,
int
mb
,
int
n
,
int
ldm
,
double
*
hhVector
)
//bottom, essential part.
{
int
i
,
j
;
double
tau
=
1
.
0
,
beta
;
/* Compute tau = 2/v'v */
for
(
i
=
0
;
i
<
mb
;
i
++
)
tau
+=
hhVector
[
i
]
*
hhVector
[
i
];
tau
=
2
/
tau
;
for
(
j
=
k
;
j
<
n
;
j
++
)
{
/* Compute v'*b_j */
beta
=
blockA
[(
j
*
ldm
)
+
k
];
/* Then for lower half */
for
(
i
=
0
;
i
<
mb
;
i
++
)
beta
+=
blockB
[(
j
*
ldm
)
+
i
]
*
hhVector
[
i
];
beta
*=
tau
;
/* Compute b_j = b_j - beta*v_k */
blockA
[(
j
*
ldm
)
+
k
]
-=
beta
;
for
(
i
=
0
;
i
<
mb
;
i
++
)
blockB
[(
j
*
ldm
)
+
i
]
-=
beta
*
hhVector
[
i
];
}
/* Insert vector below diagonal. */
for
(
i
=
0
;
i
<
mb
;
i
++
)
blockB
[(
k
*
ldm
)
+
i
]
=
hhVector
[
i
];
blockTau
[
k
]
=
tau
;
}
#pragma omp task in( blockA[0] ) inout( blockB[0] ) in( blockTau[0] )
void
DTSQRF
(
double
*
blockA
,
double
*
blockB
,
double
*
blockTau
,
int
ma
,
int
mb
,
int
n
,
int
ldm
)
{
int
k
;
double
*
xVectA
,
*
xVectB
;
double
hhVector
[
2
*
ma
];
xVectA
=
blockA
;
xVectB
=
blockB
;
for
(
k
=
0
;
k
<
n
;
k
++
)
{
//vk = sign(x[1])||x||_2e1 + x
//vk = vk/vk[0]
calcvkDouble
(
xVectA
[
0
],
ma
-
k
,
xVectB
,
(
ma
+
mb
)
-
k
,
hhVector
);
//returns essential
//matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na)
//update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros.
updateDoubleQ_WY
(
blockA
,
blockB
,
blockTau
,
k
,
ma
,
mb
,
n
,
ldm
,
hhVector
+
ma
-
k
);
xVectA
+=
ldm
+
1
;
xVectB
+=
ldm
;
}
}
#pragma omp task in( blockV[0] ) in( blockB[0] ) inout( blockA[0] ) in( blockTau[0] )
void
DSSRFT
(
double
*
blockV
,
double
*
blockA
,
double
*
blockB
,
double
*
blockTau
,
int
b
,
int
n
,
int
ldm
)
{
int
i
,
j
,
k
;
double
tau
,
beta
;
/* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B,
and for each householder vector v of blockV */
/* For each column of B */
for
(
j
=
0
;
j
<
n
;
j
++
)
{
/* For each householder vector. */
for
(
k
=
0
;
k
<
n
;
k
++
)
{
/* tau = 2/v'v, computed earlier, stored in T(k,k). */
tau
=
blockTau
[
k
];
/* Compute beta = v_k'b_j. */
/* v_k is >0 (=1) only at position k in top half. */
beta
=
blockA
[(
j
*
ldm
)
+
k
];
/* For lower portion of v_k, aligning with the lower block */
for
(
i
=
0
;
i
<
b
;
i
++
)
beta
+=
blockB
[(
j
*
ldm
)
+
i
]
*
blockV
[(
k
*
ldm
)
+
i
];
beta
*=
tau
;
/* Compute b_j = b_j - beta * v */
/* v_k = 1 at (k) in top half again */
blockA
[(
j
*
ldm
)
+
k
]
-=
beta
;
/* Apply to bottom block. */
for
(
i
=
0
;
i
<
b
;
i
++
)
blockB
[(
j
*
ldm
)
+
i
]
-=
beta
*
blockV
[(
k
*
ldm
)
+
i
];
}
}
}
/**
* @breif Wrapper to get the dependencies right.
*/
#pragma omp task inout( a[0] ) inout( tau[0] )
void
DGEQRF
(
int
matrix_order
,
lapack_int
m
,
lapack_int
n
,
double
*
a
,
lapack_int
lda
,
double
*
tau
)
{
LAPACKE_dgeqrf
(
matrix_order
,
m
,
n
,
a
,
lda
,
tau
);
}
/**
* @breif Wrapper to get the dependencies right.
*/
#pragma omp task inout( v[0] ) in( tau[0] ) inout( t[0] )
void
DLARFT
(
int
matrix_order
,
char
direct
,
char
storev
,
lapack_int
n
,
lapack_int
k
,
const
double
*
v
,
lapack_int
ldv
,
const
double
*
tau
,
double
*
t
,
lapack_int
ldt
)
{
LAPACKE_dlarft_work
(
matrix_order
,
direct
,
storev
,
n
,
k
,
v
,
ldv
,
tau
,
t
,
ldt
);
}
/**
* @brief Computed a tiled QR factorization using QuickSched.
*
* @param m Number of tile rows.
* @param n Number of tile columns.
* @param nr_threads Number of threads to use.
*/
void
test_qr
(
int
m
,
int
n
,
int
K
,
int
nr_threads
,
int
runs
)
{
int
k
,
j
,
i
,
r
;
double
*
A
,
*
A_orig
,
*
tau
;
ticks
tic
,
toc_run
,
tot_setup
,
tot_run
=
0
;
/* Allocate and fill the original matrix. */
if
(
(
A
=
(
double
*
)
malloc
(
sizeof
(
double
)
*
m
*
n
*
K
*
K
)
)
==
NULL
||
(
tau
=
(
double
*
)
malloc
(
sizeof
(
double
)
*
m
*
n
*
K
)
)
==
NULL
||
(
A_orig
=
(
double
*
)
malloc
(
sizeof
(
double
)
*
m
*
n
*
K
*
K
)
)
==
NULL
)
error
(
"Failed to allocate matrices."
);
for
(
k
=
0
;
k
<
m
*
n
*
K
*
K
;
k
++
)
A_orig
[
k
]
=
2
*
((
double
)
rand
())
/
RAND_MAX
-
1
.
0
;
memcpy
(
A
,
A_orig
,
sizeof
(
double
)
*
m
*
n
*
K
*
K
);
bzero
(
tau
,
sizeof
(
double
)
*
m
*
n
*
K
);
/* Dump A_orig. */
/* message( "A_orig = [" );
for ( k = 0 ; k < m*K ; k++ ) {
for ( j = 0 ; j < n*K ; j++ )
printf( "%.3f " , A_orig[ j*m*K + k ] );
printf( "\n" );
}
printf( "];\n" ); */
/* Loop over the number of runs. */
for
(
r
=
0
;
r
<
runs
;
r
++
)
{
/* Start the clock. */
tic
=
getticks
();
/* Launch the tasks. */
for
(
k
=
0
;
k
<
m
&&
k
<
n
;
k
++
)
{
/* Add kth corner task. */
DGEQRF
(
LAPACK_COL_MAJOR
,
K
,
K
,
&
A
[
k
*
m
*
K
*
K
+
k
*
K
]
,
m
*
K
,
&
tau
[
k
*
m
*
K
+
k
*
K
]
);
/* Add column tasks on kth row. */
for
(
j
=
k
+
1
;
j
<
n
;
j
++
)
{
DLARFT
(
LAPACK_COL_MAJOR
,
'F'
,
'C'
,
K
,
K
,
&
A
[
k
*
m
*
K
*
K
+
k
*
K
]
,
m
*
K
,
&
tau
[
k
*
m
*
K
+
k
*
K
]
,
&
A
[
j
*
m
*
K
*
K
+
k
*
K
]
,
m
*
K
);
}
/* For each following row... */
for
(
i
=
k
+
1
;
i
<
m
;
i
++
)
{
/* Add the row taks for the kth column. */
DTSQRF
(
&
A
[
k
*
m
*
K
*
K
+
k
*
K
]
,
&
A
[
k
*
m
*
K
*
K
+
i
*
K
]
,
&
tau
[
k
*
m
*
K
+
i
*
K
]
,
K
,
K
,
K
,
K
*
m
);
/* Add the inner tasks. */
for
(
j
=
k
+
1
;
j
<
n
;
j
++
)
{
DSSRFT
(
&
A
[
k
*
m
*
K
+
i
*
K
]
,
&
A
[
j
*
m
*
K
*
K
+
k
*
K
]
,
&
A
[
j
*
m
*
K
*
K
+
i
*
K
]
,
&
tau
[
k
*
m
*
K
+
i
*
K
]
,
K
,
K
,
K
*
m
);
}
}
}
/* build the tasks. */
/* Collect timers. */
toc_run
=
getticks
();
message
(
"%ith run took %lli ticks..."
,
r
,
toc_run
-
tic
);
tot_run
+=
toc_run
-
tic
;
}
/* Dump the costs. */
message
(
"costs: setup=%lli ticks, run=%lli ticks."
,
tot_setup
,
tot_run
/
runs
);
}
/**
* @brief Main function.
*/
int
main
(
int
argc
,
char
*
argv
[]
)
{
int
c
,
nr_threads
;
int
M
=
4
,
N
=
4
,
runs
=
1
,
K
=
32
;
/* Get the number of threads. */
#pragma omp parallel shared(nr_threads)
{
if
(
omp_get_thread_num
()
==
0
)
nr_threads
=
omp_get_num_threads
();
}
/* Parse the options */
while
(
(
c
=
getopt
(
argc
,
argv
,
"m:n:k:r:t:"
)
)
!=
-
1
)
switch
(
c
)
{
case
'm'
:
if
(
sscanf
(
optarg
,
"%d"
,
&
M
)
!=
1
)
error
(
"Error parsing dimension M."
);
break
;
case
'n'
:
if
(
sscanf
(
optarg
,
"%d"
,
&
N
)
!=
1
)
error
(
"Error parsing dimension M."
);
break
;
case
'k'
:
if
(
sscanf
(
optarg
,
"%d"
,
&
K
)
!=
1
)
error
(
"Error parsing tile size."
);
break
;
case
'r'
:
if
(
sscanf
(
optarg
,
"%d"
,
&
runs
)
!=
1
)
error
(
"Error parsing number of runs."
);
break
;
case
't'
:
if
(
sscanf
(
optarg
,
"%d"
,
&
nr_threads
)
!=
1
)
error
(
"Error parsing number of threads."
);
omp_set_num_threads
(
nr_threads
);
break
;
case
'?'
:
fprintf
(
stderr
,
"Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]
\n
"
,
argv
[
0
]
);
fprintf
(
stderr
,
"Computes the tiled QR decomposition of an MxN tiled
\n
"
"matrix using nr_threads threads.
\n
"
);
exit
(
EXIT_FAILURE
);
}
/* Dump arguments. */
message
(
"Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)."
,
32
*
M
,
32
*
N
,
nr_threads
,
runs
);
test_qr
(
M
,
N
,
K
,
nr_threads
,
runs
);
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment