SWIFT / QuickSched · Commit d2c38d96
Authored 10 years ago by Aidan Chalk
Added a file that wasn't uploaded
Parent: 563c738e
1 changed file: examples/test_qr.cu (new file, mode 100644), 1176 additions, 0 deletions
/* Config parameters. */
#include "../config.h"

/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>

/* Local includes. */
extern "C" {
#include "quicksched.h"
}
#include "cuda_queue.h"
enum task_types {
    task_SGEQRF,
    task_SLARFT,
    task_STSQRF,
    task_SSSRFT
};

#define TID threadIdx.x
#define numthreads 128
#define tilesize 32
#define cuda_maxtasks 1000000
#define CO(x,y,ldm) (((y)*ldm) + (x))
#define WARPS 4
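/* For illustration (not in the original): CO maps (row, column) to a
   column-major offset, e.g. CO(3, 2, 32) == (2 * 32) + 3 == 67, i.e. row 3 of
   column 2 in a 32x32 tile with leading dimension ldm = 32. */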
__device__ float *GPU_matrix;
__device__ float *GPU_tau;
__device__ int cuda_m;
__device__ int cuda_n;
/**
 * @brief Performs a reduction sum on a vector, storing the result in v[0].
 * @param sumVector SMEM The array to compute over.
 * @param startN The number of elements in the vector.
 *
 * Computes a binary reduction to find the sum of the elements in the vector,
 * storing the result in the first element of the vector.
 * Should be called with blockDim.x <= 1 warp (32 threads).
 */
__device__ void reduceSum(volatile float *sumVector, char startN) {
    /* Start with n = startN/2. */
    char n = startN >> 1;
    while (n > 0) {
        /* Use the first n threads to add up to 2n elements. */
        if (TID < n)
            sumVector[TID] = sumVector[TID] + sumVector[TID + n];
        /* Divide n by 2 for the next iteration. */
        n = n >> 1;
    }
}
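/* For illustration only (a host-side sketch, not part of the device code):
   the same halving pattern run serially. With startN = 8 and
   v = {1,2,3,4,5,6,7,8} the passes give {6,8,10,12,...}, {16,20,...},
   {36,...}, leaving the total 36 in v[0]. */
static void reduceSumHost(float *v, int startN) {
    for (int n = startN >> 1; n > 0; n >>= 1)
        for (int t = 0; t < n; t++) /* t plays the role of a thread with TID < n */
            v[t] += v[t + n];
}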
__device__ inline void reduceSumMultiWarp(float *value) {
#if __CUDA_ARCH__ >= 300
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1)
        *value += __shfl_xor((float)*value, mask);
    *value = __shfl((float)*value, 0);
#else
    __shared__ float stuff[32 * WARPS];
    int tid = TID % 32;
    int group = TID / 32;
    /* Each warp reduces its own 32-element segment of the shared buffer;
       the segment must first be seeded with this thread's value. */
    stuff[group * 32 + tid] = *value;
    char n = 32 >> 1;
    while (n > 0) {
        if (tid < n)
            stuff[group * 32 + tid] += stuff[group * 32 + tid + n];
        n = n >> 1;
    }
    __threadfence();
    *value = stuff[group * 32];
    //printf("Not supported.\n");
    //asm("trap;");
#endif
}
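/* For illustration only: on CUDA 9+ the warp-shuffle intrinsics take an
   explicit lane mask, so the butterfly reduction above would be written with
   __shfl_xor_sync/__shfl_sync. A sketch, not part of the original file: */
#if 0
__device__ inline void reduceSumMultiWarpSync(float *value) {
    for (int mask = 16; mask > 0; mask >>= 1)
        *value += __shfl_xor_sync(0xffffffff, *value, mask);
    *value = __shfl_sync(0xffffffff, *value, 0);
}
#endif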
/* Routines for the tiled QR decomposition. */
/**
 * @brief Computes the QR decomposition of a tile.
 *
 * @param cornerTile A pointer to the tile for which the decomposition is computed.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau matrix.
 * @param k The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix
 *        (this is equal to the number of tiles on each column).
 * @param w Per-thread scratch array of tileSize floats.
 *
 * Note: 32 threads max.
 */
__device__ void SGEQRF(float *__restrict__ cornerTile, int tileSize,
                       float *__restrict__ tauMatrix, int k, int tauNum,
                       float *w) {
    int i, j;
    float norm = 0.0, sign, u1, tau, z;
    /* TID (threadIdx.x) is assumed to be < 32 here; see runner(). */

    /* Find the householder vector for each row. */
    for (i = 0; i < tileSize; i++) {
        norm = 0.0;
        if (TID >= i)
            w[TID] = cornerTile[i * tileSize + TID] * cornerTile[i * tileSize + TID];
        else
            w[TID] = 0.0;
        reduceSumMultiWarp(&w[TID]);
        if (cornerTile[i * tileSize + i] >= 0.0)
            sign = -1;
        else
            sign = 1;
        norm = sqrt(w[0]);
        if (TID == 0)
            w[0] = 0.0;
        u1 = cornerTile[i * tileSize + i] - sign * norm;
        if (u1 != 0.0) {
            if (TID > i)
                w[TID] = w[TID] / u1;
        } else {
            if (TID > i)
                w[TID] = 0.0;
        }
        if (norm != 0.0)
            tau = -sign * u1 / norm;
        else
            tau = 0.0;
        /* Store the below-diagonal vector. */
        if (TID > i)
            cornerTile[i * tileSize + TID] = w[TID];
        if (TID == 0)
            cornerTile[i * tileSize + i] = sign * norm;
        if (TID == i)
            w[i] = 1.0;
        /* Apply the householder transformation to the rest of the tile,
           for everything to the right of the diagonal. */
        for (j = i + 1; j < tileSize; j++) {
            /* Compute w'*A_j. */
            if (TID >= i)
                z = cornerTile[j * tileSize + TID] * w[TID];
            else
                z = 0;
            reduceSumMultiWarp(&z);
            /* Tile(m,n) = Tile(m,n) - tau*w[n]*w'A_j. */
            if (TID >= i)
                cornerTile[j * tileSize + TID] =
                    cornerTile[j * tileSize + TID] - tau * w[TID] * z;
        }
        /* Store tau. We're on the (k*tileSize + i)th row, kth column. */
        if (TID == 0)
            tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
    }
}
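/* A minimal serial sketch (hypothetical helper, illustration only) of the
   reflector SGEQRF builds per column: given x = the column from the diagonal
   down, it produces v (implicit leading 1) and tau with the same sign
   convention as the kernel above. */
static void householderVector(const float *x, int n, float *v, float *tau) {
    float norm = 0.0f;
    for (int r = 0; r < n; r++) norm += x[r] * x[r]; /* ||x||^2 */
    norm = sqrtf(norm);
    float sign = (x[0] >= 0.0f) ? -1.0f : 1.0f;      /* as in SGEQRF above */
    float u1 = x[0] - sign * norm;
    v[0] = 1.0f;                                     /* implicit leading 1 */
    for (int r = 1; r < n; r++) v[r] = (u1 != 0.0f) ? x[r] / u1 : 0.0f;
    *tau = (norm != 0.0f) ? -sign * u1 / norm : 0.0f;
}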
/**
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile A pointer to the tile in which the householder vectors are stored.
 * @param rowTile The tile to which the householder is applied.
 * @param tileSize The number of elements on a row/column of the tile.
 * @param jj The value of j for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauMatrix A pointer to the tau matrix.
 * @param tauNum The number of tau values stored for each row of the matrix
 *        (this is equal to the number of tiles on each column).
 * @param w Scratch array of tileSize floats.
 */
void SLARFT(float *cornerTile, float *rowTile, int tileSize, int jj, int kk,
            float *tauMatrix, int tauNum, float *w) {
    int i, j, n;
    float z;

    /* For each row in the corner tile. */
    for (i = 0; i < tileSize; i++) {
        /* Get w for row i. */
        for (j = i; j < tileSize; j++) {
            w[j] = cornerTile[i * tileSize + j];
        }
        w[i] = 1.0;
        /* Apply to the row tile. */
        for (j = 0; j < tileSize; j++) {
            z = 0.0;
            /* Below the diagonal: compute w'*A_j. */
            for (n = i; n < tileSize; n++) {
                z = z + w[n] * rowTile[j * tileSize + n];
            }
            for (n = i; n < tileSize; n++) {
                rowTile[j * tileSize + n] = rowTile[j * tileSize + n]
                    - tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
            }
        }
    }
}
/**
 * @brief Computes a householder vector for a single-tile QR decomposition,
 *        storing the result in a shared array.
 * @param matelem The TIDth element of the column to be used for the calculation.
 * @param hhVector SMEM The array used to calculate and store the vector.
 * @param k The timestep at which this is called, i.e. where the diagonal falls.
 *
 * Calculates a householder vector for the vector passed as elements into a shared array.
 * Used only by runner_cuda_SGEQRF.
 */
__device__ void calcHH(float matelem, volatile float hhVector[], int k) {
    /* To store the value of 1/(v[k] + sign(v[k])*||v||). */
    float localdiv;
    /* Stores the sign of v[k]. */
    int sign;

    /* Read vectors into shared memory from below the diagonal. */
    if (TID >= k)
        hhVector[TID] = matelem;
    /* Above the diagonal, set to 0. */
    if (TID < k)
        hhVector[TID] = 0;
    /* Square each element to find ||v|| = sqrt(sum(v[i]^2)). */
    hhVector[TID] *= hhVector[TID];
    /* Reduce to find the sum of squares. */
    reduceSum(hhVector, 32);
    /* Set localdiv to equal ||v||. */
    localdiv = sqrt(hhVector[0]);

    /* According to the Householder algorithm in Golub & Van Loan. */
    if (localdiv != 0.0) {
        /* Load shared to communicate v[k] to all threads. */
        hhVector[TID] = matelem;
        /* Calculate the sign of v[k] locally. */
        sign = hhVector[k] >= 0 ? 1 : -1;
        /* Compute and store localdiv = ||v||*sign(v[k]). */
        localdiv *= sign;
        /* Compute and store localdiv = v[k] + ||v||*sign(v[k]). */
        localdiv += hhVector[k];
        /* Finally compute and store localdiv = 1/(v[k] + ||v||*sign(v[k])). */
        localdiv = 1.0f / localdiv;
    }
    /* If the norm is 0, do not change v. */
    else
        localdiv = 1.0f;

    /* Compute and store in shared memory v = v/(v[k] + ||v||*sign(v[k])). */
    if (TID < k)
        hhVector[TID] = 0.0f;
    /* v[k]/v[k] = 1. */
    if (TID == k)
        hhVector[TID] = 1.0f;
    if (TID > k)
        hhVector[TID] = matelem * localdiv;
}
/**
 * @brief Applies a householder vector to the submatrix starting at (k,k),
 *        and inserts the kth tau value into tau[k].
 * @param blockCache SMEM The 1024-element array storing the full tile to which the vector will be applied.
 * @param matTau The tau vector.
 * @param workingVector SMEM The array used in the computation as a scratchpad.
 * @param k Defines the submatrix.
 * @param hhVectorelem Element of the cumulatively-stored householder vector to apply.
 *
 * Works by first calculating the kth tau,
 * then proceeding to apply the vector to successive columns of the matrix A_j
 * with the formula A_j = A_j - tau[k]*v*(v'A_j).
 * The kth tau is then stored into memory.
 */
__device__ void applyHH(volatile float blockCache[], float *matTau,
                        volatile float workingVector[], int k,
                        float hhVectorelem) {
    float ktau, z; /* z stores the value of ktau*v*v'*A_j. */
    int j;
    float reg;

    /* Read the householder vector element to calculate ktau = 2/v'v. */
    reg /*workingVector[TID]*/ = hhVectorelem;
    /* Square the elements to start finding v'v. */
    reg *= reg;
    /* Perform a reduction to compute v'v. */
    //reduceSum(workingVector, 32);
    reduceSumMultiWarp(&reg);
    /* Finally compute ktau = 2/v'v. */
    ktau = 2.0 / reg /*workingVector[0]*/;

    /* For the columns of the submatrix starting at (k,k). */
    for (j = k; j < 32; j++) {
        /* Fill the working value to calculate v'*A_j. */
        reg = blockCache[TID + (j * 32)] * hhVectorelem;
        /* Perform a reduction to compute v'*A_j. */
        //reduceSum(workingVector, 32);
        reduceSumMultiWarp(&reg);
        /* Compute and store z = ktau*v[tid]. */
        z = hhVectorelem * ktau;
        /* Compute and store z = (v'A_j)*(ktau*v[tid]). */
        z *= reg /*workingVector[0]*/;
        /* Apply with A_j[tid] = A_j[tid] - v'A_j*ktau*v[tid]. */
        blockCache[TID + (j * 32)] -= z;
    }
    /* Store the essential (below diagonal) portion of the householder vector
       below the diagonal in the block at column k. */
    if (TID > k)
        blockCache[TID + k * 32] = hhVectorelem;
    /* Store the kth tau. */
    if (TID == 0)
        matTau[k] = ktau;
}
__device__ void runner_cuda_SGEQRF(int i, int j, int k,
                                   volatile float workingVector[],
                                   volatile float blockCache[]) {
    int kk, jj, refCache;
    refCache = TID;
    float *matrix = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *matTau = &GPU_tau[j * cuda_m * 32 + i * 32];

    /* Load the block into shared memory. */
    for (jj = 0; jj < 32; jj++) {
        blockCache[refCache] = matrix[refCache];
        refCache += 32;
    }
    for (kk = 0; kk < 32; kk++) {
        /* Calculate the kth hh vector from the kth column of the TIDth row of the matrix. */
        calcHH(blockCache[TID + (kk * 32)], /* (tid, k) */
               workingVector, kk);
        /* Calculate the application of the hh vector along row TID. */
        applyHH(blockCache, matTau, workingVector, kk, workingVector[TID]);
    }
    /* Copy the block back from shared memory. */
    refCache = TID;
    for (jj = 0; jj < 32; jj++) {
        matrix[refCache] = blockCache[refCache];
        refCache += 32;
    }
    __threadfence();
}
__device__ void runner_cuda_SLARFT(int i, int j, int k, volatile float tau[],
                                   volatile float blockCache[]) {
    int jj, kk, ii, tid = TID % 32, group = TID / 32;
    __shared__ volatile float groupCache[32 * 4];
    float *blockV = &GPU_matrix[k * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockA = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[i * cuda_m * 32 + i * 32];
    /* cacheCol is a column in the groupCache. */
    volatile float *cacheCol = groupCache + (group * 32);
    float alpha, belem;

    __syncthreads();
    /* Load the tau vector. */
    if (TID < 32)
        tau[TID] = blockTau[TID];
    /* Load the vectors. */
    for (jj = group; jj < 32; jj += WARPS) {
        if (tid < jj)
            blockCache[tid + (jj * 32)] = 0.0f;
        else if (tid == jj)
            blockCache[tid + (jj * 32)] = 1.0f;
        else if (tid > jj)
            blockCache[tid + (jj * 32)] = blockV[tid + (jj * 32)];
        //__syncthreads();
    }
    __syncthreads();

    /* Compute b_j -= tau*v*v'b_j, for all vectors in the cached block V. */
    for (jj = group; jj < 32; jj += WARPS) {
        /* Read the column into registers. */
        belem = blockA[tid + (jj * 32)];
        if (jj == 0)
            printf("%.3f %i\n", belem, TID); /* debug output */
        /* For each vector in the block of vectors. */
        for (kk = 0; kk < 32; kk++) {
            /* Compute alpha = v'*b_j. */
            cacheCol[tid] = blockCache[tid + (kk * 32)] * belem;
            if (group == 0 && jj == 0 && kk == 0)
                printf("%.3f %i %.3f %.3f\n", belem, TID,
                       blockCache[tid + kk * 32], cacheCol[tid]); /* debug output */
            if (tid < 16)
                cacheCol[tid] += cacheCol[tid + 16];
            if (tid < 8)
                cacheCol[tid] += cacheCol[tid + 8];
            alpha = cacheCol[0];
            for (ii = 1; ii < 8; ii++)
                alpha += cacheCol[ii];
            /* Compute alpha = tau * v_tid * alpha. */
            alpha *= tau[kk];
            if (TID == 0 && kk == 0)
                printf("tau[kk] = %.3f\n", tau[kk]); /* debug output */
            alpha *= blockCache[tid + (kk * 32)];
            if (jj == 0 && kk == 0)
                printf("%.3f\n", alpha); /* debug output */
            /* Compute belem -= alpha. */
            belem -= alpha;
            //__syncthreads();
        }
        blockA[tid + (jj * 32)] = belem;
        //__syncthreads();
    }
    __threadfence();
}
__device__ void applyOneHHVectD(float *topElem, float *lowElem, int k,
                                volatile float tau[],
                                volatile float blockCache[]) {
    float alpha;
    __shared__ volatile float workV[32];
    float reg;
    int tid = TID % 32;

    /* Compute the dot product of the stacked vector with the reflector:
       the top part contributes only its kth element (implicit 1 in v). */
    if (tid == k)
        reg /*workV[TID]*/ = *topElem;
    if (tid != k)
        reg /*workV[TID]*/ = 0.0f;
    reg /*workV[TID]*/ += *lowElem * blockCache[tid + (k * 32)];
    //reduceSum(workV, 32);
    reduceSumMultiWarp(&reg);
    alpha = reg /*workV[0]*/;
    /* Multiply by tau. */
    alpha *= tau[k];
    /* Compute alpha *= a_tid,j*v_tid,k. */
    if (tid == k)
        *topElem -= alpha;
    /* For the lower element. */
    alpha *= blockCache[tid + (k * 32)];
    *lowElem -= alpha;
}
__device__ void calcDoubleHHWY(float topElem, float lowElem, int k,
                               volatile float blockCache[]) {
    /* Calculate v_k. */
    int sign, i;
    float topSum = 0, lowSum = 0, alpha;
    __shared__ volatile float first;

    if (TID == k)
        first = topElem;
    topSum = first * first;
    /* Sum the squares of v to compute the norm, using a column of blockCache
       as working space. */
    blockCache[TID + (k * 32)] = lowElem * lowElem;
    for (i = 0; i < 32; i++)
        lowSum += blockCache[i + (k * 32)];
    alpha = sqrt(topSum + lowSum);
    sign = first >= 0.f ? 1.f : -1.f;
    if (alpha != 0.0f) {
        /* The zeroth element is first + sign*norm. */
        alpha = first + sign * alpha;
        alpha = 1.0f / alpha;
    } else
        alpha = 1.0f;
    //topElem *= alpha;
    lowElem *= alpha;
    blockCache[TID + (k * 32)] = lowElem;
}
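/* Serial sketch (hypothetical helper, illustration only) of the same stacked
   ("double") reflector for column k: the top part contributes only its
   diagonal entry, the bottom part is a full column of tile B, and the
   essential part written to blockCache is low[i] * alpha. */
static float doubleHHHost(float top, const float *low, int n, float *vLow) {
    float norm2 = top * top;
    for (int i = 0; i < n; i++) norm2 += low[i] * low[i];
    float sign = (top >= 0.0f) ? 1.0f : -1.0f;
    float alpha = top + sign * sqrtf(norm2); /* zeroth element: first + sign*norm */
    alpha = (alpha != 0.0f) ? 1.0f / alpha : 1.0f;
    for (int i = 0; i < n; i++) vLow[i] = low[i] * alpha; /* essential part */
    return alpha;
}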
__device__ void runner_cuda_STSQRF(int i, int j, int k,
                                   volatile float tauVect[],
                                   volatile float blockCache[]) {
    /* Idea: for each column j of (AB)^T,
       apply HH vectors 0...j-1,
       compute the new HH vector for column j,
       store the essential part in blockCache.
       Uses one block cache and one vector storage. */
    float *blockA = &GPU_matrix[k * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockB = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[j * cuda_m * 32 + i * 32];
    int tid = TID % 32;
    int group = TID / 32;
    float topElem, lowElem, tau;

    for (j = 0; j < 32; j++) {
        if (TID < 32) {
            if (TID <= j)
                topElem = blockA[TID + (j * 32)];
            if (TID > j)
                topElem = 0.f;
            lowElem = blockB[TID + (j * 32)];
            //for(k = 0; k < j; k++)
            //{
            //    /* Compute b_tid,j = b_tid,j - tau*vv'*b_tid,j */
            //    applyOneHHVectD(&topElem, &lowElem, k, tauVect, blockCache);
            //}
            calcDoubleHHWY(topElem, lowElem, j, blockCache);
            /* Compute the new tau = 2/v'v; the implicit top element of v is 1. */
            tau = 1.0f;
            for (i = 0; i < 32; i++)
                tau += blockCache[i + (j * 32)] * blockCache[i + (j * 32)];
            tau = 2.0f / tau;
            if (TID == j)
                tauVect[j] = tau;
        }
        __syncthreads();
        /* Apply the new vector to the remaining columns. */
        for (k = j + group; k < 32; k += WARPS) {
            if (k != j) {
                if (tid <= k)
                    topElem = blockA[tid + k * 32];
                if (tid > k)
                    topElem = 0.f;
                lowElem = blockB[tid + k * 32];
            }
            applyOneHHVectD(&topElem, &lowElem, j, tauVect, blockCache);
            if (tid <= k)
                blockA[tid + k * 32] = topElem;
            blockB[tid + k * 32] = lowElem;
            //__syncthreads();
        }
        __syncthreads();
        /* Write back. */
        //if(TID <= j)
        //    blockA[TID + (j*ldm)] = topElem;
    }
    /* Write back the lower block, containing the householder vectors. */
    if (TID < 32) {
        for (j = 0; j < 32; j++)
            blockB[TID + (j * 32)] = blockCache[TID + (j * 32)];
        blockTau[TID] = tauVect[TID];
    }
    __threadfence();
}
__device__ void runner_cuda_SSSRFT(int i, int j, int k, volatile float tau[],
                                   volatile float blockCache[]) {
    __shared__ volatile float workV[WARPS * 32];
    __syncthreads();
    float aelem, belem, alpha;
    float *blockV = &GPU_matrix[k * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockA = &GPU_matrix[j * cuda_m * 32 * 32 + k * 32 * 32];
    float *blockB = &GPU_matrix[j * cuda_m * 32 * 32 + i * 32 * 32];
    float *blockTau = &GPU_tau[k * cuda_m * 32 + i * 32];
    int tid, group, refMat, refCache;
    tid = TID % 32;
    group = TID / 32;
    refMat = tid + group * 32;
    refCache = tid + group * 32;

    if (TID < 32)
        tau[TID] = blockTau[TID];
    __syncthreads();

    /* Load the essential HH vector block into the shared cache. */
    for (j = group; j < 32; j += WARPS) {
        blockCache[refCache] = blockV[refMat];
        refMat += WARPS * 32;
        refCache += WARPS * 32;
        __syncthreads();
    }
    __syncthreads();

    /* For each column of the result. */
    for (j = group; j < 32; j += WARPS) {
        //if(tid == 0) printf("%d: column %d.\n", group, j);
        /* Load the elements of the column to process. */
        aelem = blockA[tid + (j * 32)];
        belem = blockB[tid + (j * 32)];
        /* Compute and apply b_j = b_j - tau*vv'b_j
           for each Householder vector 1..32. */
        for (k = 0; k < 32; k++) {
            /* Load the kth element into all threads. */
            workV[tid + (group * 32)] = aelem;
            /* Store this as alpha. */
            alpha = workV[k + (group * 32)];
            /* Load the components of v'b_j to sum. */
            workV[tid + (group * 32)] = belem * blockCache[tid + (k * 32)];
            if (tid < 16)
                workV[tid + (group * 32)] += workV[16 + tid + (group * 32)];
            if (tid < 8)
                workV[tid + (group * 32)] += workV[8 + tid + (group * 32)];
            /* Compute v'b_j. */
            for (i = 0; i < 8; i++)
                alpha += workV[i + (group * 32)];
            /* Multiply by the kth tau. */
            alpha *= tau[k];
            /* Compute b_j -= alpha*v.
               If this is the kth thread in the group, v is 1 at aelem. */
            if (tid == k)
                aelem -= alpha;
            /* Compute b_j -= alpha * v for the lower half. */
            belem -= alpha * blockCache[tid + (k * 32)];
            __syncthreads();
        }
        /* Put the elements back. */
        blockA[tid + (j * 32)] = aelem;
        blockB[tid + (j * 32)] = belem;
        //__syncthreads();
    }
    __syncthreads();
}
__device__ void runner(int type, void *data) {
    __shared__ volatile float blockCache[(32 * 32) + 128];
    volatile float *workVector;
    workVector = blockCache + (32 * 32);

    /* Decode the task data. */
    int *idata = (int *)data;
    int i = idata[0], j = idata[1], k = idata[2];

    /* Decode and execute the task. */
    switch (type) {
        case task_SGEQRF:
            if (threadIdx.x < 32)
                runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
            if (threadIdx.x == 0) { /* debug output */
                printf("SGEQRF: %i %i %i + %i\n", i, j, k, clock());
                printf("tau = ");
                for (k = 0; k < cuda_m * cuda_n * 32; k++) {
                    printf("%.3f ", GPU_tau[k]);
                }
                printf("\n");
            }
            break;
        case task_SLARFT:
            if (threadIdx.x == 0)
                printf("SLARFT: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_SLARFT(i, j, k, workVector, blockCache);
            break;
        case task_STSQRF:
            if (threadIdx.x == 0)
                printf("STSQRF: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_STSQRF(i, j, k, workVector, blockCache);
            break;
        case task_SSSRFT:
            if (threadIdx.x == 0)
                printf("SSSRFT: %i %i %i + %i\n", i, j, k, clock());
            //runner_cuda_SSSRFT(i, j, k, workVector, blockCache);
            break;
        default:
            asm("trap;");
    }
}
__device__ qsched_funtype function = runner;

__global__ void Setup() {
    /* Print the device-side function pointer (%p rather than %i, since the
       value is a pointer). */
    printf("%p\n", (void *)function);
}
float *generateMatrix(int m, int n) {
    float *Matrix;
    cudaMallocHost(&Matrix, sizeof(float) * m * n * 32 * 32);
    if (Matrix == NULL)
        error("Failed to allocate Matrix");
    int i, j;
    memset(Matrix, 0, sizeof(float) * m * n * 32 * 32);
    for (i = 0; i < n * 32; i++) {
        for (j = 0; j < m * 32; j++) {
            float value = i / 32 + j / 32;
            if (j / 32 > 0) {
                value += 2.f;
            }
            Matrix[i * m * 32 + j] = value;
        }
    }
    return Matrix;
}
float *columnToTile(float *columnMatrix, int size, int m, int n) {
    float *TileMatrix;
    cudaMallocHost(&TileMatrix, sizeof(float) * size);
    if (TileMatrix == NULL)
        error("failed to allocate TileMatrix");
    int i, j, k, l;
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            float *tileStart;
            float *tilePos = &TileMatrix[i * m * 32 * 32 + j * 32 * 32];
            for (k = 0; k < 32; k++) {
                tileStart = &columnMatrix[i * m * 32 * 32 + k * m * 32 + j * 32];
                for (l = 0; l < 32; l++) {
                    tilePos[k * 32 + l] = tileStart[l];
                }
            }
        }
    }
    return TileMatrix;
}
float *tileToColumn(float *tileMatrix, int size, int m, int n) {
    float *ColumnMatrix;
    ColumnMatrix = (float *)malloc(sizeof(float) * size);
    if (ColumnMatrix == NULL)
        error("failed to allocate ColumnMatrix");
    int i, j, k, l;
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            /* The tile on the ith tile-column is at i*m*32*32,
               and on the jth tile-row at j*32*32. */
            float *tile = &tileMatrix[i * m * 32 * 32 + j * 32 * 32];
            /* The column starts at the same position as the tile, row j*32. */
            float *tilePos = &ColumnMatrix[i * m * 32 * 32 + j * 32];
            for (k = 0; k < 32; k++) {
                for (l = 0; l < 32; l++) {
                    tilePos[l] = tile[l];
                }
                /* The next 32 elements are the position of the tile in the next column. */
                tile = &tile[32];
                /* Move to the (j*32)th position in the next column. */
                tilePos = &tilePos[32 * m];
            }
        }
    }
    return ColumnMatrix;
}
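/* For illustration (hypothetical helper, not in the original): the offset of
   the global element (row, col) of an (m*32) x (n*32) column-major matrix
   inside the tiled layout produced by columnToTile, i.e. the inverse view of
   tileToColumn. */
static int tileOffset(int row, int col, int m) {
    int ti = row / 32, tj = col / 32; /* tile coordinates */
    int r = row % 32, c = col % 32;   /* position inside the 32x32 tile */
    return tj * m * 32 * 32 + ti * 32 * 32 + c * 32 + r;
}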
float *createIdentity(int m, int n) {
    float *Matrix;
    cudaMallocHost(&Matrix, sizeof(float) * m * n * 32 * 32);
    if (Matrix == NULL)
        error("Failed to allocate Matrix");
    int i, j;
    memset(Matrix, 0, sizeof(float) * m * n * 32 * 32);
    for (i = 0; i < n * 32; i++) {
        for (j = 0; j < m * 32; j++) {
            if (i == j) {
                Matrix[i * m * 32 + j] = 1.f;
            } else {
                Matrix[i * m * 32 + j] = 0.f;
            }
        }
    }
    return Matrix;
}
void printMatrix(float *Matrix, int m, int n) {
    int i, j;
    for (i = 0; i < m * 32; i++) {
        for (j = 0; j < n * 32; j++) {
            printf(" %.1f ", Matrix[j * m * 32 + i]);
        }
        printf("\n");
    }
}
/*void printTileMatrix(float* Matrix, int m, int n)
{
int i,j;
float *tempMatrix, *tempMatrix2;
for(i = 0; i < m*32; i++)
{
int tiled = i/32;
tempMatrix = &Matrix[tiled*32*32];
for(j = 0; j < n*32; j++)
{
int tile = j/32;
tempMatrix2 = &tempMatrix[tile*32*32*m + i%32];
printf(" %.1f ", tempMatrix2[j*32]);
//printf(" %.1f,%i ", tempMatrix2[j*32], &tempMatrix2[j*32]-Matrix);
}
printf("\n");
}
}*/
qsched_funtype func;
void test_qr(int m, int n, int K, int nr_threads, int runs) {
    int k, j, i;
    float *A, *A_orig, *tau, *tau_device, *device_array, *temp_device_array;
    struct qsched s;
    qsched_task_t *tid, tid_new;
    qsched_res_t *rid;
    int data[3];
    ticks tic, toc_run, tot_setup, tot_run = 0;

    /* Initialize the scheduler. */
    qsched_init(&s, 1, qsched_flag_none);
    cudaDeviceReset();
    cudaSetDevice(0);
    Setup<<<1, 1>>>();
    if (cudaDeviceSynchronize() != cudaSuccess)
        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));

    /* Allocate and fill the original matrix. */
    if (cudaMallocHost(&A, sizeof(float) * m * n * K * K) != cudaSuccess ||
        cudaMallocHost(&tau, sizeof(float) * m * n * K) != cudaSuccess ||
        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K) != cudaSuccess)
        error("Failed to allocate matrices.");
    /* A_orig is re-allocated by generateMatrix below. */
    cudaFreeHost(A_orig);
    // for (k = 0; k < m * n * K * K; k++)
    //     A_orig[k] = 2.0f * ((float)rand()) / RAND_MAX - 1.0f;
    A_orig = generateMatrix(m, n);
    float *temp = columnToTile(A_orig, m * n * K * K, m, n);
    cudaFreeHost(A_orig);
    A_orig = temp;
    temp = tileToColumn(A_orig, m * n * K * K, m, n);
    // printMatrix(temp, m, n);
    // printTileMatrix(A_orig, m, n);
    memcpy(A, A_orig, sizeof(float) * m * n * K * K);
    bzero(tau, sizeof(float) * m * n * K);
    if (cudaMalloc(&tau_device, sizeof(float) * m * n * K) != cudaSuccess)
        error("Failed to allocate tau on the device");
    if (cudaMemcpy(tau_device, tau, sizeof(float) * m * n * K,
                   cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the tau data to the device.");
    if (cudaMemcpyToSymbol(GPU_tau, &tau_device, sizeof(float *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the tau pointer to the device.");
    if (cudaMalloc(&device_array, sizeof(float) * m * n * K * K) != cudaSuccess)
        error("Failed to allocate the matrix on the device");
    if (cudaMemcpyToSymbol(GPU_matrix, &device_array, sizeof(float *), 0,
                           cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy the matrix pointer to the device");

    /* Allocate and init the task ID and resource ID matrix. */
    tic = getticks();
    if (cudaMallocHost(&tid, sizeof(qsched_task_t) * m * n) != cudaSuccess)
        error("Failed to allocate tid");
    if (cudaMallocHost(&rid, sizeof(qsched_task_t) * m * n) != cudaSuccess)
        error("Failed to allocate rid");
    for (k = 0; k < m * n; k++) {
        tid[k] = qsched_task_none;
        if (cudaHostGetDevicePointer(&temp_device_array, &A[k * 32 * 32], 0) != cudaSuccess)
            error("Failed to get device pointer to matrix");
        rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none,
                               &A[k * 32 * 32], sizeof(float) * 32 * 32,
                               device_array + k * 32 * 32);
    }

    /* Build the tasks. */
    for (k = 0; k < m && k < n; k++) {
        /* Add the kth corner task. */
        data[0] = k;
        data[1] = k;
        data[2] = k;
        tid_new = qsched_addtask(&s, task_SGEQRF, task_flag_none, data,
                                 sizeof(int) * 3, 2);
        qsched_addlock(&s, tid_new, rid[k * m + k]);
        if (tid[k * m + k] != -1)
            qsched_addunlock(&s, tid[k * m + k], tid_new);
        tid[k * m + k] = tid_new;
        /* Add the column tasks on the kth row. */
        for (j = k + 1; j < n; j++) {
            data[0] = k;
            data[1] = j;
            data[2] = k;
            tid_new = qsched_addtask(&s, task_SLARFT, task_flag_none, data,
                                     sizeof(int) * 3, 3);
            qsched_addlock(&s, tid_new, rid[j * m + k]);
            qsched_adduse(&s, tid_new, rid[k * m + k]);
            qsched_addunlock(&s, tid[k * m + k], tid_new);
            if (tid[j * m + k] != -1)
                qsched_addunlock(&s, tid[j * m + k], tid_new);
            tid[j * m + k] = tid_new;
        }
        /* For each following row... */
        for (i = k + 1; i < m; i++) {
            /* Add the row tasks for the kth column. */
            data[0] = i;
            data[1] = k;
            data[2] = k;
            tid_new = qsched_addtask(&s, task_STSQRF, task_flag_none, data,
                                     sizeof(int) * 3, 3);
            qsched_addlock(&s, tid_new, rid[k * m + i]);
            qsched_adduse(&s, tid_new, rid[k * m + k]);
            qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
            if (tid[k * m + i] != -1)
                qsched_addunlock(&s, tid[k * m + i], tid_new);
            tid[k * m + i] = tid_new;
            /* Add the inner tasks. */
            for (j = k + 1; j < n; j++) {
                data[0] = i;
                data[1] = j;
                data[2] = k;
                tid_new = qsched_addtask(&s, task_SSSRFT, task_flag_none, data,
                                         sizeof(int) * 3, 5);
                qsched_addlock(&s, tid_new, rid[j * m + i]);
                qsched_adduse(&s, tid_new, rid[k * m + i]);
                qsched_adduse(&s, tid_new, rid[j * m + k]);
                qsched_addunlock(&s, tid[k * m + i], tid_new);
                qsched_addunlock(&s, tid[j * m + k], tid_new);
                if (tid[j * m + i] != -1)
                    qsched_addunlock(&s, tid[j * m + i], tid_new);
                tid[j * m + i] = tid_new;
            }
        }
    }
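    /* For a 2x2 grid of tiles the loops above build the classic tiled-QR DAG
       (an illustration, derived from the locks/unlocks added above):
           SGEQRF(0,0) -> SLARFT(0,1), STSQRF(1,0)
           STSQRF(1,0), SLARFT(0,1) -> SSSRFT(1,1)
           SSSRFT(1,1) -> SGEQRF(1,1)
       where each task locks the tile it overwrites and "uses" the tiles it
       only reads. */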
    /* Fetch the function pointer and run the tasks. */
    if (cudaMemcpyFromSymbol(&func, function, sizeof(qsched_funtype)) != cudaSuccess)
        error("Failed to copy the function pointer from the device");
    if (cudaMemcpyToSymbol(cuda_m, &m, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy m to the device");
    if (cudaMemcpyToSymbol(cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice) != cudaSuccess)
        error("Failed to copy n to the device");
    qsched_run_CUDA(&s, func);
    /* Copy the result back from the device (note the direction:
       cudaMemcpyDeviceToHost). */
    cudaMemcpy(A, device_array, sizeof(float) * m * n * K * K,
               cudaMemcpyDeviceToHost);
    A = tileToColumn(A, m * n * K * K, m, n);
    printMatrix(A, m, n);
    // printTileMatrix(A, m, n);
}
void test() {
    int m = 2, n = 2;
    float *Matrix = createIdentity(m, n);
    //printMatrix(Matrix, m, n);
    float *MatrixTile = columnToTile(Matrix, m * n * 32 * 32, m, n);
    printMatrix(&MatrixTile[0], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[32 * 32], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[64 * 32], 1, 1);
    printf("\n\n\n");
    printMatrix(&MatrixTile[3 * 32 * 32], 1, 1);
    printf("\n\n\n");
    //printTileMatrix(MatrixTile, m, n);
    /* Matrix and MatrixTile were allocated with cudaMallocHost, so release
       them with cudaFreeHost; only the tileToColumn result uses malloc. */
    cudaFreeHost(Matrix);
    Matrix = tileToColumn(MatrixTile, m * n * 32 * 32, m, n);
    printMatrix(Matrix, m, n);
    cudaFreeHost(MatrixTile);
    free(Matrix);
}
/**
 * @brief Main function.
 */
int main(int argc, char *argv[]) {
    int c, nr_threads = 128;
    int M = 4, N = 4, runs = 1, K = 32;

    /* Get the number of threads. */
    //#pragma omp parallel shared(nr_threads)
    //{
    //    if (omp_get_thread_num() == 0)
    //        nr_threads = omp_get_num_threads();
    //}

    /* Parse the options. */
    while ((c = getopt(argc, argv, "m:n:k:r:t:D")) != -1)
        switch (c) {
            case 'm':
                if (sscanf(optarg, "%d", &M) != 1)
                    error("Error parsing dimension M.");
                break;
            case 'n':
                if (sscanf(optarg, "%d", &N) != 1)
                    error("Error parsing dimension N.");
                break;
            case 'k':
                if (sscanf(optarg, "%d", &K) != 1)
                    error("Error parsing tile size.");
                break;
            case 'r':
                if (sscanf(optarg, "%d", &runs) != 1)
                    error("Error parsing number of runs.");
                break;
            case 't':
                if (sscanf(optarg, "%d", &nr_threads) != 1)
                    error("Error parsing number of threads.");
                //omp_set_num_threads(nr_threads);
                break;
            case 'D':
                test();
                exit(EXIT_SUCCESS);
            case '?':
                fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
                        argv[0]);
                fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
                                "matrix using nr_threads threads.\n");
                exit(EXIT_FAILURE);
        }

    /* Dump arguments. */
    message("Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs).",
            32 * M, 32 * N, nr_threads, runs);
    test_qr(M, N, K, nr_threads, runs);
}
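/* Example invocation (binary name hypothetical; the flags match the getopt
   string above): compute the tiled QR of a 128x128 matrix (4x4 tiles of 32x32)
   with 128 threads, or run the tiling self-test with -D:
       ./test_qr -m 4 -n 4 -t 128
       ./test_qr -D
*/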