QuickSched (SWIFT group), commit 60922b1a
authored 10 years ago by aidan

    Hopefully fixed the issues with hierarchical task generation

parent 045b696d
Showing 3 changed files, with 368 additions and 405 deletions:

  examples/test_bh_2.cu        +106 −21
  src/cuda_queue.cu            +112 −384
  src/cuda_queue.cu.outdated   +150 −0
examples/test_bh_2.cu (+106 −21)
@@ -90,7 +90,9 @@ float4 *parts_a_m_host;
 double2 *com_xy_host;
 double *com_z_host;
 float *com_mass_host;
+double2 *parts_pos_xy_temp;
+double *parts_pos_z_temp;
+float4 *parts_a_m_temp;
@@ -113,9 +115,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
   /* Loop over cell i.*/
   for (i = parts_i + threadIdx.x; i < parts_i + count_i; i += blockDim.x) {
-    xi[0] = parts_xy[i].x;
-    xi[1] = parts_xy[i].y;
-    xi[2] = parts_z[i];
+    xi[0] = parts_pos_xy[i].x;
+    xi[1] = parts_pos_xy[i].y;
+    xi[2] = parts_pos_z[i];
     for (k = 0; k < 3; k++) {
       ai[k] = 0.0f;
     }
@@ -154,9 +156,9 @@ __device__ __forceinline__ void iact_pair_direct(struct cell *ci, struct cell *c
   }
   /*Loop over cell j. */
   for (i = parts_j + threadIdx.x; i < parts_j + count_j; i += blockDim.x) {
-    xi[0] = parts_xy[i].x;
-    xi[1] = parts_xy[i].y;
-    xi[2] = parts_z[i];
+    xi[0] = parts_pos_xy[i].x;
+    xi[1] = parts_pos_xy[i].y;
+    xi[2] = parts_pos_z[i];
     for (k = 0; k < 3; k++) {
       ai[k] = 0.0f;
     }
@@ -290,23 +292,40 @@ __device__ void iact_pair_pc(struct cell *ci, struct cell *cj, struct cell *leaf
   struct cell *cp, *cps;
+  if (leaf->split) {
+    printf("Leaf split = 1, oh dear.");
+    asm("trap;");
+  }
+  if (ci->split > 1) {
+    printf("Cell %i had split > 1\n", ci - cells);
+    asm("trap;");
+  }
+  if (cj->split > 1) {
+    printf("cell %i had split > 1\n", cj - cells);
+    asm("trap;");
+  }
   for (cp = &cells[ci->firstchild]; cp != &cells[ci->sibling]; cp = &cells[cp->sibling]) {
     if (is_inside(leaf, cp)) break;
   }
   if (are_neighbours_different_size(cp, cj)) {
-    for (cps = &cells[cj->firstchild]; cps != &cells[ci->sibling]; cps = &cells[cps->sibling]) {
+    for (cps = &cells[cj->firstchild]; cps != &cells[cj->sibling]; cps = &cells[cps->sibling]) {
       if (are_neighbours(cp, cps)) {
         if (cp->split && cps->split) {
           iact_pair_pc(cp, cps, leaf);
         }
       } else {
         make_interact_pc(leaf, cps);
+        __syncthreads();
       }
     }
   }
+  __syncthreads();
 }

 /**
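Note on the one-line fix above: both loops walk a cell's children through the sibling links, so the loop sentinel must be the sibling of the cell whose children are being walked; the old code terminated the cps loop over cj's children at &cells[ci->sibling]. A minimal standalone sketch of this traversal pattern, reusing the firstchild/sibling fields from the file (the pool layout in main is hypothetical):

#include <stdio.h>

/* Cut-down cell record: a cell's children sit in a pool and are chained by
 * 'sibling'; the parent's own 'sibling' index marks one-past-the-last child. */
struct cell { int firstchild; int sibling; int split; };

static struct cell cells[8];

/* Visit every direct child of cells[c]. The sentinel must come from the same
 * parent c, otherwise the walk does not stop at the end of c's children. */
static void visit_children(int c) {
  struct cell *cp;
  for (cp = &cells[cells[c].firstchild]; cp != &cells[cells[c].sibling];
       cp = &cells[cp->sibling])
    printf("child at index %d\n", (int)(cp - cells));
}

int main(void) {
  /* Hypothetical pool: cell 0 has children 1..3; index 4 is its sibling. */
  cells[0] = (struct cell){ 1, 4, 1 };
  cells[1] = (struct cell){ -1, 2, 0 };
  cells[2] = (struct cell){ -1, 3, 0 };
  cells[3] = (struct cell){ -1, 4, 0 };
  visit_children(0);
  return 0;
}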
@@ -320,6 +339,17 @@ __device__ void iact_self_pc(struct cell *c, struct cell *leaf) {
   struct cell *cp, *cps;
+  if (leaf->split) {
+    printf("Leaf split = 1, oh dear.");
+    asm("trap;");
+  }
+  if (c->split > 1) {
+    printf("Cell had split > 1\n");
+    asm("trap;");
+  }
   /* Find the subcell of c the leaf is in.*/
   for (cp = &cells[c->firstchild]; cp != &cells[c->sibling]; cp = &cells[cp->sibling]) {
     if (is_inside(leaf, cp)) break;
@@ -606,23 +636,23 @@ void cell_split(int c, struct qsched *s) {
     {
       if (cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[c].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
-      printf("tempxy = %p\n", temp_xy);
+      // printf("tempxy = %p\n", temp_xy);
       cell_pool[c].res = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_xy,
-                                       sizeof(double2) * cell_pool[c].count, parts_pos_xy + cell_pool[c].parts);
+                                       sizeof(double2) * cell_pool[c].count, parts_pos_xy_temp + cell_pool[c].parts);
     }
     if (cell_pool[c].resz == qsched_res_none)
     {
       if (cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[c].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
       cell_pool[c].resz = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_z,
-                                        sizeof(double) * cell_pool[c].count, parts_pos_z + cell_pool[c].parts);
+                                        sizeof(double) * cell_pool[c].count, parts_pos_z_temp + cell_pool[c].parts);
     }
     if (cell_pool[c].resm == qsched_res_none)
     {
       if (cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[c].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
       cell_pool[c].resm = qsched_addres(s, qsched_owner_none, qsched_res_none, temp_a_m,
-                                        sizeof(float4) * cell_pool[c].count, parts_a_m + cell_pool[c].parts);
+                                        sizeof(float4) * cell_pool[c].count, parts_a_m_temp + cell_pool[c].parts);
     }
     // error("Cell has no resource");*///TODO
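The resources above wrap pinned host buffers whose device-side view comes from cudaHostGetDevicePointer. As a reminder of what that call expects (not part of the commit), here is a minimal sketch: the host array must be allocated as mapped pinned memory, for example with cudaHostAlloc and cudaHostAllocMapped; names such as host_parts and dev_view are illustrative only.

#include <cuda_runtime.h>
#include <cstdio>

int main(void) {
  /* Allow mapped (zero-copy) host allocations before the context is created. */
  cudaSetDeviceFlags(cudaDeviceMapHost);

  double2 *host_parts, *dev_view;
  int N = 256;

  /* Pinned, mapped host buffer. */
  if (cudaHostAlloc((void **)&host_parts, sizeof(double2) * N,
                    cudaHostAllocMapped) != cudaSuccess) {
    fprintf(stderr, "host alloc failed\n");
    return 1;
  }
  /* Device-visible alias of the same memory, as used for temp_xy above. */
  if (cudaHostGetDevicePointer((void **)&dev_view, host_parts, 0) != cudaSuccess) {
    fprintf(stderr, "failed to get host device pointer\n");
    return 1;
  }
  printf("host %p maps to device %p\n", (void *)host_parts, (void *)dev_view);
  cudaFreeHost(host_parts);
  return 0;
}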
@@ -731,15 +761,15 @@ void cell_split(int c, struct qsched *s) {
       if (cudaHostGetDevicePointer(&temp_xy, &parts_pos_xy_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
       cell_pool[progenitors[k]].res = qsched_addres(s, qsched_owner_none, cell->res, temp_xy,
-                                                    sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy + cell_pool[progenitors[k]].parts);
+                                                    sizeof(double2) * cell_pool[progenitors[k]].count, parts_pos_xy_temp + cell_pool[progenitors[k]].parts);
       if (cudaHostGetDevicePointer(&temp_z, &parts_pos_z_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
       cell_pool[progenitors[k]].resz = qsched_addres(s, qsched_owner_none, cell->resz, temp_z,
-                                                     sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z + cell_pool[progenitors[k]].parts);
+                                                     sizeof(double) * cell_pool[progenitors[k]].count, parts_pos_z_temp + cell_pool[progenitors[k]].parts);
       if (cudaHostGetDevicePointer(&temp_a_m, &parts_a_m_host[cell_pool[progenitors[k]].parts], 0) != cudaSuccess)
         error("Failed to get host device pointer.");
       cell_pool[progenitors[k]].resm = qsched_addres(s, qsched_owner_none, cell->resm, temp_a_m,
-                                                     sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m + cell_pool[progenitors[k]].parts);
+                                                     sizeof(float4) * cell_pool[progenitors[k]].count, parts_a_m_temp + cell_pool[progenitors[k]].parts);
     }

     /* Find the first non-empty progenitor */
@@ -784,6 +814,9 @@ void cell_split(int c, struct qsched *s) {
   int data[2] = {root, c};
   int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                            2 * sizeof(int), 1);
+  qsched_addlock(s, tid, cell_pool[root].res);
+  qsched_addlock(s, tid, cell_pool[root].resz);
+  qsched_addlock(s, tid, cell_pool[root].resm);
   qsched_addlock(s, tid, cell_pool[c].res);
   qsched_addlock(s, tid, cell_pool[c].resz);
   qsched_addlock(s, tid, cell_pool[c].resm);
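The three new qsched_addlock calls give the self_pc task exclusive access to the root cell's resources as well as the leaf cell's. Pulled out of context, the pattern is: add the task, then lock every resource it will write. A condensed sketch, assuming the example's globals (cell_pool, task_type_self_pc) and only the QuickSched calls already used in this file:

/* Sketch only: enqueue one self_pc task and lock all six resources it
 * touches, mirroring the lines added above. Assumes an initialized
 * scheduler s and valid cell_pool indices root and c. */
static void add_self_pc_task(struct qsched *s, int root, int c) {
  int data[2] = {root, c};
  int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data,
                           2 * sizeof(int), 1);
  qsched_addlock(s, tid, cell_pool[root].res);   /* root cell: xy positions */
  qsched_addlock(s, tid, cell_pool[root].resz);  /* root cell: z positions  */
  qsched_addlock(s, tid, cell_pool[root].resm);  /* root cell: acc/mass     */
  qsched_addlock(s, tid, cell_pool[c].res);
  qsched_addlock(s, tid, cell_pool[c].resz);
  qsched_addlock(s, tid, cell_pool[c].resm);
}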
@@ -913,9 +946,7 @@ void test_bh(int N, int runs, char *fileName) {
   struct qsched s;
   ticks tic, toc_run, tot_setup = 0, tot_run = 0;
   int countMultipoles = 0, countPairs = 0, countCoMs = 0;
-  double2 *parts_pos_xy_temp;
-  double *parts_pos_z_temp;
-  float4 *parts_a_m_temp;
+  struct cell *gpu_ptr_cells;
   cudaFree(0);
   if (cudaMemcpyFromSymbol(&func, function, sizeof(qsched_funtype)) != cudaSuccess)
@@ -934,10 +965,10 @@ void test_bh(int N, int runs, char *fileName) {
   if (cudaMalloc(&parts_pos_xy_temp, sizeof(double2) * N) != cudaSuccess)
     error("Failed to allocate device parts array");
-  printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
+  // printf("parts_pos_xy_temp = %p\n", parts_pos_xy_temp);
   if (cudaMemcpyToSymbol(parts_pos_xy, &parts_pos_xy_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
     error("Failed to set device symbol for parts array");
-  printf("parts_pos_xy = %p\n", parts_pos_xy);
+  // printf("parts_pos_xy = %p\n", parts_pos_xy);
   if (cudaMalloc(&parts_pos_z_temp, sizeof(double) * N) != cudaSuccess)
     error("Failed to allocate device parts array");
   if (cudaMemcpyToSymbol(parts_pos_z, &parts_pos_z_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
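The blocks above follow one pattern throughout: cudaMalloc a device buffer, then publish its address through a __device__ symbol with cudaMemcpyToSymbol so kernels can reach it by name. A minimal self-contained sketch of that pattern (buf and d_buf are illustrative names, not from the commit):

#include <cuda_runtime.h>
#include <cstdio>

__device__ double *buf;   /* device-side global pointer, like parts_pos_z */

int main(void) {
  double *d_buf;
  int N = 1024;
  if (cudaMalloc(&d_buf, sizeof(double) * N) != cudaSuccess) {
    fprintf(stderr, "Failed to allocate device array\n");
    return 1;
  }
  /* Copy the pointer value (not the data) into the device symbol. */
  if (cudaMemcpyToSymbol(buf, &d_buf, sizeof(double *), 0,
                         cudaMemcpyHostToDevice) != cudaSuccess) {
    fprintf(stderr, "Failed to set device symbol\n");
    return 1;
  }
  cudaFree(d_buf);
  return 0;
}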
@@ -1018,6 +1049,10 @@ void test_bh(int N, int runs, char *fileName) {
   message("Average number of parts per leaf is %lf.", ((double)N) / ((double)nr_leaves));
   message("Max number of parts in a leaf is %i, min number is %i", maxparts, minparts);
+  for (k = 0; k < num_cells; k++)
+    if (cell_pool[k].split > 1)
+      printf("Split > 1\n");
   create_tasks(&s, root, NULL);
   message("total number of tasks: %i.", s.count);
@@ -1033,6 +1068,56 @@ void test_bh(int N, int runs, char *fileName) {
     parts_a_m_host[i].z = 0.0;
   }
+  /* Copy the cells to the device. */
+  if (cudaMalloc(&gpu_ptr_cells, sizeof(struct cell) * used_cells) != cudaSuccess)
+    error("Failed to allocate cells on the GPU");
+  if (cudaMemcpy(gpu_ptr_cells, cell_pool, sizeof(struct cell) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("Failed to copy cells to the GPU");
+  if (cudaMemcpyToSymbol(cells, &gpu_ptr_cells, sizeof(struct cell *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("Failed to copy cell pointer to the GPU");
+  double2 *com_temp;
+  double *comz_temp;
+  float *comm_temp;
+  if (cudaMalloc(&com_temp, sizeof(double2) * used_cells) != cudaSuccess)
+    error("Failed to allocate com on the GPU");
+  if (cudaMemcpy(com_temp, com_xy_host, sizeof(double2) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("failed to copy com to the GPU");
+  if (cudaMemcpyToSymbol(com_xy, &com_temp, sizeof(double2 *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("Failed to copy com pointer to the GPU");
+  if (cudaMalloc(&comz_temp, sizeof(double) * used_cells) != cudaSuccess)
+    error("Failed to allocate com on the GPU");
+  if (cudaMemcpy(comz_temp, com_z_host, sizeof(double) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("failed to copy com to the GPU");
+  if (cudaMemcpyToSymbol(com_z, &comz_temp, sizeof(double *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("Failed to copy com pointer to the GPU");
+  if (cudaMalloc(&comm_temp, sizeof(float) * used_cells) != cudaSuccess)
+    error("Failed to allocate com on the GPU");
+  if (cudaMemcpy(comm_temp, com_z_host, sizeof(float) * used_cells, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("failed to copy com to the GPU");
+  if (cudaMemcpyToSymbol(com_mass, &comm_temp, sizeof(float *), 0, cudaMemcpyHostToDevice) != cudaSuccess)
+    error("Failed to copy com pointer to the GPU");
+  /* for(i = 0; i < s.count; i++){
+     int *idata = (int*)&s.data[s.tasks[i].data];
+     if(s.tasks[i].type == task_type_self)
+       printf("Self task with data[0] = %i\n", idata[0]);
+     if(s.tasks[i].type == task_type_pair) {
+       printf("Pair task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+     }
+     if(s.tasks[i].type == task_type_self_pc) {
+       printf("PC task with data[0] = %i and data[1] = %i\n", idata[0], idata[1]);
+     }
+  }*/
   //Run code.
   printf("gpu_data = %p\n", (int *)s.res[0].gpu_data);
   qsched_run_CUDA(&s, func);
src/cuda_queue.cu (+112 −384)
(diff collapsed in the page view; contents not shown)
src/cuda_queue.cu.outdated (new file, 0 → 100644, +150 −0)
/*
 * Not used.
 */
void qsched_prepare_deps( struct qsched *s )
{
    int **is_loaded;
    int **parents;
    int i, k, j, use, usek, usem;
    int *num_parents;

    is_loaded = (int**)malloc(sizeof(int*) * s->count);
    parents = (int**)malloc(sizeof(int*) * s->count);
    num_parents = (int*)malloc(sizeof(int) * s->count);
    bzero(num_parents, sizeof(int)*s->count);
    k = (sizeof(int)*s->count_res)/32 + 1;
    for(i = 0; i < s->count; i++)
    {
        is_loaded[i] = (int*)malloc(k);
        bzero(is_loaded[i], k);
    }
    /* Is loaded[i][k] gives the set of k*32 resources for task i*/

    /* Reset the waits to 0... */
    for( k = 0; k < s->count; k++ )
    {
        s->tasks[k].wait = 0;
    }

    /* Run through the tasks and set the waits... */
    for ( k = 0 ; k < s->count ; k++ ) {
        struct task *t = &s->tasks[k];
        if ( !( t->flags & task_flag_skip ) )
            for ( j = 0 ; j < t->nr_unlocks ; j++ )
                s->tasks[ t->unlocks[j] ].wait += 1;
    }

    /* Sort the tasks topologically. */
    int *tid = (int *)malloc( sizeof(int) * s->count );
    for ( j = 0 , k = 0 ; k < s->count ; k++ )
        if ( s->tasks[k].wait == 0 ) {
            tid[j] = k;
            j += 1;
        }
    for ( k = 0 ; k < j ; k++ ) {
        struct task *t = &s->tasks[ tid[k] ];
        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
            if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
                tid[j] = t->unlocks[kk];
                j += 1;
            }
    }
    if ( k < s->count )
    {
        error( "Circular dependencies detected." );
    }

    /* Store the maximum number of parents of a task.*/
    int max_parents = 0;

    /* Compute the number of parents for each task */
    for(i = s->count-1; i >= 0; i--)
    {
        for(j = 0; j < s->tasks[i].nr_unlocks; j++)
        {
            num_parents[s->tasks[i].unlocks[j]]++;
        }
    }

    /* Allocate memory to store parents in.*/
    for(i = 0; i < s->count; i++)
    {
        if(num_parents[i] > 0)
            parents[i] = (int*)calloc(num_parents[i], sizeof(int));
        else
            parents[i] = NULL;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        num_parents[i] = 0;
    }

    /* This seems to be outdated and unused?*/
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32;
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, s->res[use].task , i ) ;
                printf("We actually did this?!");
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < s->tasks[i].nr_unlocks; k++)
        {
            if(s->tasks[s->tasks[i].unlocks[k]].type == type_load ||
               s->tasks[s->tasks[i].unlocks[k]].type == type_unload )
                continue;
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j];
            }
            parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i;
            num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1;
        }
    }

    max_parents = 0;
    for(i = 0; i < s->count; i++)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        if(num_parents[i] > max_parents)
        {
            max_parents = num_parents[i];
        }
        bzero(is_loaded[i], k);
    }

    for(i = s->count-1; i >= 0; i--)
    {
        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
            continue;
        for(k = 0; k < s->tasks[i].nr_uses; k++)
        {
            use = s->tasks[i].uses[k];
            usek = use >> 5; // use / 32;
            usem = use & 31; // use % 32.
            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
            {
                qsched_addunlock(s, i, s->res[use].utask );
                is_loaded[i][usek] |= (1 << (31-usem));
            }
        }
        for(k = 0; k < num_parents[i]; k++)
        {
            for(j = 0; j < s->count_res/32 + 1; j++)
            {
                is_loaded[parents[i][k]][j] |= is_loaded[i][j];
            }
        }
    }
}
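The is_loaded bitsets above pack one bit per resource: resource use lives in word use >> 5 (use / 32) at bit position 31 - (use & 31). A small standalone illustration of that indexing (the sizes and the resource id below are made up):

#include <stdio.h>
#include <string.h>

int main(void) {
  int count_res = 100;                    /* number of resources (example) */
  int words = count_res / 32 + 1;         /* words needed for one bitset   */
  int is_loaded[words];
  memset(is_loaded, 0, sizeof(is_loaded));

  int use = 70;                           /* mark resource 70 as loaded    */
  int usek = use >> 5;                    /* word index:  use / 32         */
  int usem = use & 31;                    /* bit offset:  use % 32         */
  is_loaded[usek] |= (1 << (31 - usem));

  printf("resource %d loaded? %s\n", use,
         (is_loaded[usek] & (1 << (31 - usem))) ? "yes" : "no");
  return 0;
}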