diff --git a/examples/plot.py b/examples/plot.py
index ce4c28bc95438f6817e6e6c67d70864c9cf0b8e7..2b26ddb804ad0188bdf8d4759ef220f288bb04be 100644
--- a/examples/plot.py
+++ b/examples/plot.py
@@ -39,17 +39,17 @@ print "Plotting..."
 # Read Quickshed accelerations
 data=loadtxt("particle_dump.out")
 id = data[:,0]
-accx_e=data[:,4]
-accy_e=data[:,5]
-accz_e=data[:,6]
+accx_e=data[:,5]
+accy_e=data[:,6]
+accz_e=data[:,7]
 
-accx_bh=data[:,7]
-accy_bh=data[:,8]
-accz_bh=data[:,9]
+accx_bh=data[:,8]
+accy_bh=data[:,9]
+accz_bh=data[:,10]
 
-accx_new=data[:,10]
-accy_new=data[:,11]
-accz_new=data[:,12]
+accx_new=data[:,11]
+accy_new=data[:,12]
+accz_new=data[:,13]
 
 # Sort accelerations
 rank = argsort(id)
diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index 79ee40a04a0271b2640994986acd8ebcaebee83e..deef47386e0d7b284885d14ac928843694769935 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -49,7 +49,7 @@
 #define SANITY_CHECKS
 #define NO_COM_AS_TASK
 #define NO_COUNTERS
-#define EXACT
+#define NO_EXACT
 
 
 
@@ -1287,7 +1287,7 @@ for(i = 0; i < s.count_ranks; i++)
     }
 
     //Need to clean up everything.
-    free(parts);
+//    free(parts);
 
     //TODO Clean up the cell-resource data.
     qsched_free(&s);
@@ -1330,6 +1330,8 @@ int main(int argc, char *argv[]) {
       if (sscanf(optarg, "%d", &nr_threads) != 1)
         error("Error parsing number of threads.");
       omp_set_num_threads(nr_threads);
+      message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+      message("omp_get_num_procs() = %i\n", omp_get_num_procs());
       break;
     case 'f':
       if (sscanf(optarg, "%s", &fileName[0]) != 1)
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 556de9e01fb02fa1effd93bc731447bc0800160b..005e83e9c69beb48ad51f6728d37e5e1981635c3 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -587,7 +587,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         qsched_adduse(&s, tid_new, rid[k * m + i]);
         qsched_adduse(&s, tid_new, rid[j * m + k]);
 //        qsched_addunlock(&s, tid[k * m + i], tid_new);
-        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+        qsched_addunlock(&s, tid[j * m + (i - 1)], tid_new);
         if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
         tid[j * m + i] = tid_new;
 
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index 673f1ee5ce0512d0c160264011999d25c750b0b2..7fed164ea6e7d9d3b9b933a532e2d25925f6e319 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -40,6 +40,7 @@
 #include "quicksched.h"
 #include "res.h"
 
+#define TASK_TIMERS
 
 #ifdef WITH_CBLAS
 /**
@@ -466,14 +467,27 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
     int k, j, i;
     double* A, *A_orig= NULL, *tau;
     struct qsched s;
-    qsched_task_t* tid = NULL, tid_new;
+    qsched_task_t* tid = NULL, tid_new = -1;
     qsched_res_t *rid = NULL, *tau_id = NULL;
     double **rids = NULL, **taus = NULL;
     int data[3];
+#ifdef TASK_TIMERS
+    long long int MPI_data[7];
+#else
     long long int MPI_data[6];
+#endif
 //    ticks tic, toc_run, tot_setup, tot_run = 0;
+#ifdef TASK_TIMERS
+    ticks tic, toc_run;
+#endif
     char processor_name[MPI_MAX_PROCESSOR_NAME];
     int name_len;
+    #ifdef TASK_TIMERS
+    long long int *task_start;
+    long long int *task_finish;
+    int *task_tids;
+    int *task_types;
+    #endif
 
     // Initialize the MPI environment
     int MpiThreadLevel;
@@ -486,19 +500,23 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
     MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN );
 
     enum task_types {
-        task_DGEQRF,
+        task_DGEQRF=1,
         task_DLARFT,
         task_DTSQRF,
         task_DSSRFT
     };
+    printf("Task_DGEQRF = %i\n", task_DGEQRF);
 
     /* Runner function to pass to the scheduler. */
     void runner(struct qsched *s, int type, void * data) {
-
+        #ifdef TASK_TIMERS
+        tic = getticks();
+        #endif
        /* Decode the task data. */
        long long int* idata = (long long int*)data;
        // int i = idata[0], j = idata[1], k = idata[2];
        long long int i, j , a , t;
+       long long int tid = -1;
 
        /* Need to pull the resources.*/
        /* Decode and execute the task. */
@@ -509,6 +527,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
            double *cornerTile = (double*)qsched_getresdata(s, i);
            double *tau = (double*)qsched_getresdata(s, j);
            DGEQRF(cornerTile, K, tau, k, m);
+           #ifdef TASK_TIMERS
+           tid = (idata[2] * m * m);
+           #endif
            break;
        case task_DLARFT:
            t = idata[2];
@@ -519,6 +540,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
            tau = (double*)qsched_getresdata(s, t);
 
            DLARFT(cornerTile, lockedTile, K, j, k, tau, m);
+           #ifdef TASK_TIMERS
+           tid = (idata[3] * m * m) + (idata[4] * m);
+           #endif
            break;
        case task_DTSQRF:
            i = idata[0];
@@ -529,6 +553,9 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
            tau = (double*)qsched_getresdata(s, t);
 
            DTSQRF(cornerTile, lockedTile, K, i, k, tau, m);
+           #ifdef TASK_TIMERS
+           tid = (idata[3] * m * m) + (idata[3] * m) + idata[4];
+           #endif
            break;
        case task_DSSRFT:
            a = idata[2];
@@ -541,13 +568,33 @@ if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
            tau = (double*)qsched_getresdata(s, t);
 
            DSSRFT(lockedTile1, usedTile, lockedTile2, K, i, j, k, tau, m);
+           #ifdef TASK_TIMERS
+           tid = (idata[4] * m * m) + (idata[6] * m) + (idata[5]);
+           #endif
            break;
 //       default:
 //           error("Unknown task type.");
        }
+       #ifdef TASK_TIMERS
+       if(type > 0){
+           toc_run = getticks();
+           task_start[tid] = tic;
+           task_finish[tid] = toc_run;
+           task_tids[tid] = omp_get_thread_num();
+           task_types[tid] = type;
+       }
+       #endif
    }
 
-    qsched_init(&s, nr_threads, qsched_flag_noreown, MPI_COMM_WORLD);
+    qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);
+
+#ifdef TASK_TIMERS
+task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
+printf("Created task_start of size %i", m*n*K*K);
+task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
+task_tids = (int*)calloc(sizeof(int), m*n*K*K);
+task_types = (int*)calloc(sizeof(int), m*n*K*K);
+#endif
 
 if(s.rank == 0){
     /* Allocate and fill the original matrix. */
@@ -599,8 +646,14 @@ if(s.rank == 0) {
       data[2] = k;
       MPI_data[0] = rid[k*m+k];
       MPI_data[1] = tau_id[k*m+k];
+      #ifdef TASK_TIMERS
+      MPI_data[2] = k;
+      tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
+                               sizeof(long long int) * 3, 300);
+      #else
       tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
                                sizeof(long long int) * 2, 200);
+      #endif
       qsched_addlock(&s, tid_new, rid[k * m + k]);
       qsched_addlock(&s, tid_new, tau_id[k*m+k]);
       if(k == 0)
@@ -618,8 +671,15 @@ if(s.rank == 0) {
           MPI_data[0] = rid[j*m+k];
           MPI_data[1] = rid[k*m+k];
           MPI_data[2] = tau_id[k*m+k];
+          #ifdef TASK_TIMERS
+          MPI_data[3] = k;
+          MPI_data[4] = j;
+          tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
+                                   sizeof(long long int) * 5, 300);
+          #else
           tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
                                    sizeof(long long int) * 3, 300);
+          #endif
           if(k == 0)
           {
               memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -642,8 +702,15 @@ if(s.rank == 0) {
           MPI_data[0] = rid[k*m+i];
           MPI_data[1] = rid[k*m+k];
           MPI_data[2] = tau_id[k*m+i];
+          #ifdef TASK_TIMERS
+          MPI_data[3] = k;
+          MPI_data[4] = i;
+          tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
+                                   sizeof(long long int) * 5, 300);
+          #else
           tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
                                    sizeof(long long int) * 3, 300);
+          #endif
           if(k == 0)
           {
               memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -654,7 +721,6 @@ if(s.rank == 0) {
           qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
           if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
           tid[k * m + i] = tid_new;
-
           /* Add the inner tasks. */
           for (j = k + 1; j < n; j++) {
               data[0] = i;
@@ -664,8 +730,16 @@ if(s.rank == 0) {
               MPI_data[1] = rid[k*m+i];
               MPI_data[2] = rid[j*m+k];
               MPI_data[3] = tau_id[k*m+i];
+              #ifdef TASK_TIMERS
+              MPI_data[4] = k;
+              MPI_data[5] = i;
+              MPI_data[6] = j;
+              tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
+                                       sizeof(long long int) * 7, 300);
+              #else
               tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
                                        sizeof(long long int) * 4, 500);
+              #endif
               if(k == 0)
               {
                   memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
@@ -692,6 +766,30 @@ qsched_run_MPI(&s, nr_threads, runner);
 //TODO Clean up the resource data.
 qsched_free(&s);
+#ifdef TASK_TIMERS
+FILE *file = NULL;
+    if(s.rank == 0)
+        file = fopen("Task_timers0.out", "w");
+    else if (s.rank == 1)
+        file = fopen("Task_timers1.out", "w");
+    else if (s.rank == 2)
+        file = fopen("Task_timers2.out", "w");
+    else if (s.rank == 3)
+        file = fopen("Task_timers3.out", "w");
+
+    for(i = 0; i < m*n*K*K; i++)
+    {
+        if(task_types[i] > 0)
+            fprintf(file, "%i %i %lli %lli\n", task_types[i], task_tids[i],task_start[i], task_finish[i] );
+    }
+
+
+    fclose(file);
+    free(task_types);
+    free(task_tids);
+    free(task_start);
+    free(task_finish);
+#endif
 
 // Finalize the MPI environment.
 MPI_Finalize();
 }
@@ -705,6 +803,12 @@ int main(int argc, char* argv[]) {
 
   int c, nr_threads=1;
   int M = 4, N = 4, runs = 1, K = 32;
+/* Get the number of threads. */
+//#pragma omp parallel shared(nr_threads)
+  //{
+  //  if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+  //}
+
   /* Parse the options */
   while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
     case 'm':
@@ -724,6 +828,8 @@ int main(int argc, char* argv[]) {
         if (sscanf(optarg, "%d", &nr_threads) != 1)
           error("Error parsing number of threads.");
         omp_set_num_threads(nr_threads);
+        message("omp_get_max_threads() = %i\n", omp_get_max_threads());
+        message("omp_get_num_procs() = %i\n", omp_get_num_procs());
         break;
       case '?':
         fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
diff --git a/src/qsched.c b/src/qsched.c
index a9ba6fb0ccc90f938291435580c1f12199dde38f..a30ae00ecae6d489d27e9332721491466f6ec38d 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -3428,6 +3428,7 @@ printf("Rank[%i]: qsched_prepare_mpi took %lli (= %e) ticks\n", s->rank,
   {
     /* Local variable. */
     struct task *t;
+    printf("Hello from thread %i on CPU %i\n", omp_get_thread_num(), sched_getcpu());
 
     /* Get the ID of the current thread. */
     int qid = omp_get_thread_num() % s->nr_queues;