Skip to content
Snippets Groups Projects
Select Git revision
  • 5b59263bc1e1053579ebda9adc77eb5cc1209dea
  • master default protected
  • MPI_qsched
  • resource_reuse
  • task_timers
  • openMP_locks
  • fortran_bindings
  • fortran_with_timers
  • paper_updates
  • aidan
  • ompss_fixes
  • paper_fixes
  • FMM
  • autotools-update
  • paper_simulated_memcpy
  • gpu_friendly_bh
  • unbiased_gravity
  • test_bh_2
  • cleanup
  • merge_from_svn
20 results

test_qr_mpi.c

Blame
  • user avatar
    d74ksy authored
    Added some stuff for the qr and bh testcases as diagnostics. qr now has task timers output. Bug in mpirun on gtx690 means i can only run with 1 core inside the mpirun wrapper. There is probably a bug in the QR decomposition as it scales far too well...
    5b59263b
    History
    test_qr_mpi.c 26.45 KiB
    /*******************************************************************************
     * This file is part of QuickSched.
     * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
     *
     * This program is free software: you can redistribute it and/or modify
     * it under the terms of the GNU Lesser General Public License as published
     * by the Free Software Foundation, either version 3 of the License, or
     * (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU Lesser General Public License
     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
     *
    * *****************************************************************************/
    /* Config parameters. */
    #include "../config.h"
    
    /* Standard includes. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <math.h>
    #include <float.h>
    #include <limits.h>
    #include <omp.h>
    #include <fenv.h>
    #include <mpi.h>
    
    
    #ifdef WITH_CLBAS
    #include <cblas.h>
    #endif
    
    /* Local includes. */
    #include "quicksched.h"
    #include "res.h"
    
    #define TASK_TIMERS
    
    #ifdef WITH_CBLAS
    /**
     * Takes a column major matrix, NOT tile major. size is length of a side of the
     * matrix. Only works for square matrices.
     * This function is simply for validation and is implemented naively as we know
     * of no implementation to retrieve Q from the tiled QR.
     */
    double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
      double* Q = malloc(sizeof(double) * size * size);
      double* Qtemp = malloc(sizeof(double) * size * size);
      double* w = malloc(sizeof(double) * size);
      double* ww = malloc(sizeof(double) * size * size);
      double* temp = malloc(sizeof(double) * size * size);
      int i, k, l, j, n;
      bzero(Q, sizeof(double) * size * size);
      bzero(Qtemp, sizeof(double) * size * size);
      bzero(ww, sizeof(double) * size * size);
      for (i = 0; i < size; i++) {
        Q[i * size + i] = 1.0;
      }
      int numcoltile = size / tilesize;
      int numrowtile = size / tilesize;
      for (k = 0; k < numrowtile; k++) {
        for (l = 0; l < tilesize; l++) {
          bzero(Qtemp, sizeof(double) * size * size);
          for (i = 0; i < size; i++) {
            Qtemp[i * size + i] = 1.0;
          }
    
          for (i = k; i < numcoltile; i++) {
            bzero(w, sizeof(double) * size);
    
            for (j = 0; j < tilesize; j++) {
              w[i * tilesize + j] =
                  HR[(k * tilesize + l) * size + i * tilesize + j];
            }
            w[k * tilesize + l] = 1.0;
            if (k * tilesize + l > i * tilesize) {
              for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
            }
    
            /* Compute (I - tau*w*w')' */
            for (j = 0; j < size; j++) {
              for (n = 0; n < size; n++) {
                if (j != n)
                  ww[n * size + j] =
                      -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
                else
                  ww[n * size + j] =
                      1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
              }
            }
    
            /* Qtemp = Qtemp * (I-tau*w*w')' */
            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                        1.0, Qtemp, size, ww, size, 0.0, temp, size);
            double* b = Qtemp;
            Qtemp = temp;
            temp = b;
          }
          cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                      1.0, Q, size, Qtemp, size, 0.0, temp, size);
          double* b = Q;
          Q = temp;
          temp = b;
        }
      }
    
      free(Qtemp);
      free(w);
      free(ww);
      free(temp);
      return Q;
    }
    
    #endif
    
    double* getR(double* HR, int size) {
      double* R = malloc(sizeof(double) * size * size);
      int i, j;
      bzero(R, sizeof(double) * size * size);
      for (i = 0; i < size; i++) {
        for (j = 0; j <= i; j++) {
          R[i * size + j] = HR[i * size + j];
        }
      }
      return R;
    }
    
    void printMatrix(double* Matrix, int m, int n, int tilesize) {
      int i, j;
    
      for (i = 0; i < m * tilesize; i++) {
        for (j = 0; j < n * tilesize; j++) {
          printf(" %.3f ", Matrix[j * m * tilesize + i]);
        }
        printf("\n");
      }
    }
    
    double* columnToTile(double* columnMatrix, int size, int m, int n,
                         int tilesize) {
      double* TileMatrix;
      TileMatrix = malloc(sizeof(double) * size);
      if (TileMatrix == NULL) error("failed to allocate TileMatrix");
      int i, j, k, l;
    
      for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
          double* tileStart =
              &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
          double* tilePos =
              &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
          for (k = 0; k < tilesize; k++) {
            tileStart = &columnMatrix[i * m * tilesize * tilesize +
                                      k * m * tilesize + j * tilesize];
            for (l = 0; l < tilesize; l++) {
              tilePos[k * tilesize + l] = tileStart[l];
            }
          }
        }
      }
    
      return TileMatrix;
    }
    
    double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
      double* ColumnMatrix;
      ColumnMatrix = (double*)malloc(sizeof(double) * size);
      if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
      int i, j, k, l;
      for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
          /* Tile on ith column is at i*m*32*32.*/
          /* Tile on jth is at j*32*32 */
          double* tile =
              &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
          /* Column starts at same position as tile. */
          /* Row j*32.*/
          double* tilePos =
              &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
          for (k = 0; k < tilesize; k++) {
            for (l = 0; l < tilesize; l++) {
              tilePos[l] = tile[l];
            }
            /* Next 32 elements are the position of the tile in the next column.*/
            tile = &tile[tilesize];
            /* Move to the j*32th position in the next column. */
            tilePos = &tilePos[tilesize * m];
          }
        }
      }
      return ColumnMatrix;
    }
    
    /* Routines for the tiled QR decomposition.*/
    
    /**
     *
     * @brief Computes the QR decomposition of a tile.
     *
     * @param cornerTile A pointer to the tile for which the decomposition is
    * computed.
     * @param tilesize The number of elements on a row/column of the tile.
     * @param tauMatrix A pointer to the tau Matrix.
     * @param k The value of k for the QR decomposition
     * @param tauNum The number of tau values stored for each row of the matrix
    * (this is equal to the number of tiles on each column).
     *
     *
     */
    void DGEQRF(double* restrict cornerTile, int tileSize,
                double* restrict tauMatrix, int k, int tauNum) {
      int i, j, n;
      double norm = 0.0, sign, u1, tau, z;
      double w[tileSize];
    
      /*Find the householder vector for each row. */
      for (i = 0; i < tileSize; i++) {
        norm = 0.0;
        /*Fill w with the vector.*/
        for (j = i; j < tileSize; j++) {
          /* ith row is i*tileSize, only want elements on diagonal or below.*/
          w[j] = cornerTile[i * tileSize + j];
          /*Find the norm as well */
          norm = norm + w[j] * w[j];
        }
        if (w[i] >= 0.0)
          sign = -1;
        else
          sign = 1;
    
        norm = sqrt(norm);
    
        u1 = w[i] - sign * norm;
    
        if (u1 != 0.0) {
          for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
        } else {
          for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
        }
    
        if (norm != 0.0)
          tau = -sign * u1 / norm;
        else
          tau = 0.0;
    
        /*Store the below diagonal vector */
    
        for (j = i + 1; j < tileSize; j++) {
          cornerTile[i * tileSize + j] = w[j];
        }
        cornerTile[i * tileSize + i] = sign * norm;
        w[i] = 1.0;
        /* Apply the householder transformation to the rest of the tile, for
         * everything to the right of the diagonal. */
        for (j = i + 1; j < tileSize; j++) {
          /*Compute w'*A_j*/
          z = cornerTile[j * tileSize + i];
          for (n = i + 1; n < tileSize; n++) {
            z = z + cornerTile[j * tileSize + n] * w[n];
          }
          /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
          for (n = i; n < tileSize; n++) {
            cornerTile[j * tileSize + n] =
                cornerTile[j * tileSize + n] - tau * w[n] * z;
          }
        }
        /* Store tau. We're on k*tileSize+ith row. kth column.*/
    //    tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
          tauMatrix[i] = tau;
      }
    }
    
    /**
     *
     * @brief Applies the householder factorisation of the corner to the row tile.
     *
     * @param cornerTile A pointer to the tile for which the householder is stored.
     * @param rowTiles The tile to which the householder is applied.
     * @param tilesize The number of elements on a row/column of the tile.
     * @param tauMatrix A pointer to the tau Matrix.
     * @param jj The value of j for the QR decomposition.
     * @param kk The value of k for the QR decomposition.
     * @param tauNum The number of tau values stored for each row of the matrix
    * (this is equal to the number of tiles on each column).
     *
     *
     */
    void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
                int jj, int kk, double* restrict tauMatrix, int tauNum) {
      int i, j, n;
      double z = 0.0;
      double w[tileSize];
    
      /* For each row in the corner Tile*/
      for (i = 0; i < tileSize; i++) {
        /*Get w for row i */
        for (j = i; j < tileSize; j++) {
          w[j] = cornerTile[i * tileSize + j];
        }
        w[i] = 1.0;
    
        /* Apply to the row Tile */
        for (j = 0; j < tileSize; j++) {
          z = 0.0;
          /* Below Diagonal!*/
          /*Compute w'*A_j*/
          for (n = i; n < tileSize; n++) {
            z = z + w[n] * rowTile[j * tileSize + n];
          }
          for (n = i; n < tileSize; n++) {
            rowTile[j * tileSize + n] =
                rowTile[j * tileSize + n] -
                //tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
                tauMatrix[i] * w[n] * z;
          }
        }
      }
    }
    
    /**
     *
     * @brief Applies the householder factorisation of the corner to the row tile.
     *
     * @param cornerTile The corner tile for this value of k.
     * @param columnTile The tile for which householders are computed.
     * @param tilesize The number of elements on a row/column of the tile.
     * @param tauMatrix A pointer to the tau Matrix.
     * @param ii The value of i for the QR decomposition.
     * @param kk The value of k for the QR decomposition.
     * @param tauNum The number of tau values stored for each row of the matrix
    * (this is equal to the number of tiles on each column).
     *
     *
     */
    void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
                int tilesize, int ii, int kk, double* restrict tauMatrix,
                int tauNum) {
      int i, j, n;
      double norm = 0.0, sign, u1, tau, z;
      double w[2*tilesize];
    
      /* For each column compute the householder vector. */
      for (i = 0; i < tilesize; i++) {
        norm = 0.0;
        w[i] = cornerTile[i * tilesize + i];
        norm = norm + w[i] * w[i];
        for (j = i + 1; j < tilesize; j++) {
          w[j] = 0.0;
        }
        for (j = 0; j < tilesize; j++) {
          w[tilesize + j] = columnTile[i * tilesize + j];
          norm = norm + w[tilesize + j] * w[tilesize + j];
        }
    
        norm = sqrt(norm);
        if (w[i] >= 0.0)
          sign = -1;
        else
          sign = 1;
    
        u1 = w[i] - sign * norm;
        if (u1 != 0.0) {
          for (j = i + 1; j < 2 * tilesize; j++) {
            w[j] = w[j] / u1;
          }
        } else {
          for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
        }
    
        if (norm != 0)
          tau = -sign * u1 / norm;
        else
          tau = 0.0;
    
        /* Apply to each row to the right.*/
        for (j = i; j < tilesize; j++) {
          /* Find w'*A_j, w is 0s except for first value with upper tile.*/
          z = 1.0 * cornerTile[j * tilesize + i];
          for (n = 0; n < tilesize; n++) {
            z = z + w[tilesize + n] * columnTile[j * tilesize + n];
          }
          /* Apply to upper tile.*/
          cornerTile[j * tilesize + i] =
              cornerTile[j * tilesize + i] - tau * 1.0 * z;
          for (n = i + 1; n < tilesize; n++) {
            cornerTile[j * tilesize + n] =
                cornerTile[j * tilesize + n] - tau * w[n] * z;
          }
          /* Apply to lower tile.*/
          for (n = 0; n < tilesize; n++) {
            columnTile[j * tilesize + n] =
                columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
          }
        }
        /* Store w*/
        for (j = 0; j < tilesize; j++) {
          columnTile[i * tilesize + j] = w[tilesize + j];
        }
    //    tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
          tauMatrix[i] = tau;
      }
    }
    
    /**
     *
     * @brief Applies the householder factorisation of the corner to the row tile.
     *
     * @param cornerTile A pointer to the tile to have the householder applied.
     * @param columnTile The tile in which the householders are stored.
     * @param rowTile The upper tile to have the householders applied.
     * @param tilesize The number of elements on a row/column of the tile.
     * @param tauMatrix A pointer to the tau Matrix.
     * @param ii The value of i for the QR decomposition.
     * @param jj The value of j for the QR decomposition.
     * @param kk The value of k for the QR decomposition.
     * @param tauNum The number of tau values stored for each row of the matrix
    * (this is equal to the number of tiles on each column).
     *
     *
     */
    void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
                double* restrict rowTile, int tilesize, int ii, int jj, int kk,
                double* restrict tauMatrix, int tauNum) {
      int i, j, n;
      double z;
      double w[2*tilesize];
    
      for (i = 0; i < tilesize; i++) {
        for (j = 0; j < i; j++) w[j] = 0.0;
        w[i] = 1.0;
        for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
        for (j = 0; j < tilesize; j++)
          w[j + tilesize] = columnTile[i * tilesize + j];
    
        /* Apply householder vector (w) to the tiles.*/
        for (j = 0; j < tilesize; j++) {
          z = 0.0;
          /* Compute w' * A_j */
          for (n = 0; n < tilesize; n++) {
            z += w[n] * rowTile[j * tilesize + n];
            z += w[n + tilesize] * cornerTile[j * tilesize + n];
          }
          for (n = 0; n < tilesize; n++) {
            rowTile[j * tilesize + n] =
                rowTile[j * tilesize + n] -
                //tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
                tauMatrix[i] * w[n] * z;
            cornerTile[j * tilesize + n] =
                cornerTile[j * tilesize + n] -
            //    tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
                tauMatrix[i] * w[tilesize+ n] * z;
          }
        }
      }
    }
    
    
    
    void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
    
      int k, j, i;
      double* A, *A_orig= NULL, *tau;
      struct qsched s;
      qsched_task_t* tid = NULL, tid_new = -1;
      qsched_res_t *rid = NULL, *tau_id = NULL;
      double **rids = NULL, **taus = NULL;
      int data[3];
    #ifdef TASK_TIMERS
      long long int MPI_data[7];
    #else
      long long int MPI_data[6];
    #endif
    //  ticks tic, toc_run, tot_setup, tot_run = 0;
    #ifdef TASK_TIMERS
      ticks tic, toc_run;
    #endif
    char processor_name[MPI_MAX_PROCESSOR_NAME];
        int name_len;
      #ifdef TASK_TIMERS
      long long int *task_start;
      long long int *task_finish;
      int *task_tids;
      int *task_types;
      #endif
    
        // Initialize the MPI environment
    int MpiThreadLevel;
     MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &MpiThreadLevel);
    
    if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
        error("We didn't get thread multiple!");
    
        MPI_Get_processor_name(processor_name, &name_len);
        MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN );
    
      enum task_types {
        task_DGEQRF=1,
        task_DLARFT,
        task_DTSQRF,
        task_DSSRFT
      };
    
        printf("Task_DGEQRF = %i\n", task_DGEQRF);
      /* Runner function to pass to the scheduler. */
      void runner(struct qsched *s, int type, void * data) {
        #ifdef TASK_TIMERS
        tic = getticks();
        #endif
        /* Decode the task data. */
        long long int* idata = (long long int*)data;
    //    int i = idata[0], j = idata[1], k = idata[2];
        long long int i, j , a , t;
        long long int tid = -1;
        /* Need to pull the resources.*/
    
        /* Decode and execute the task. */
        switch (type) {
          case task_DGEQRF:
            i = idata[0];
            j = idata[1];
            double *cornerTile = (double*)qsched_getresdata(s, i);
            double *tau = (double*)qsched_getresdata(s, j);
            DGEQRF(cornerTile, K, tau, k, m);
            #ifdef TASK_TIMERS
            tid = (idata[2] * m * m);
            #endif
            break;
          case task_DLARFT:
            t = idata[2];
            i = idata[0];
            j = idata[1];
            double *lockedTile = (double*)qsched_getresdata(s, i);
            cornerTile = (double*)qsched_getresdata(s, j);
            tau = (double*)qsched_getresdata(s, t);
            DLARFT(cornerTile, lockedTile, K, j, k, tau,
                   m);
            #ifdef TASK_TIMERS
            tid = (idata[3] * m * m) + (idata[4] * m);
            #endif
            break;
          case task_DTSQRF:
            i = idata[0];
            j = idata[1];
            t = idata[2];
            lockedTile = (double*)qsched_getresdata(s, i);
            cornerTile = (double*)qsched_getresdata(s, j);
            tau = (double*)qsched_getresdata(s, t);
            DTSQRF(cornerTile, lockedTile, K, i, k, tau,
                   m);
            #ifdef TASK_TIMERS
            tid = (idata[3] * m * m) + (idata[3] * m) + idata[4];
            #endif
            break;
          case task_DSSRFT:
            a = idata[2];
            i = idata[0];
            j = idata[1];
            t = idata[3];
            double *lockedTile1 = (double*)qsched_getresdata(s, i);
            double *usedTile = (double*)qsched_getresdata(s, j);
            double *lockedTile2 = (double*)qsched_getresdata(s, a);
            tau = (double*)qsched_getresdata(s, t);
            DSSRFT(lockedTile1, usedTile,
                   lockedTile2, K, i, j, k, tau, m);
            #ifdef TASK_TIMERS
            tid = (idata[4] * m * m) + (idata[6] * m) + (idata[5]);
            #endif
            break;
    //      default:
      //      error("Unknown task type.");
        }
        #ifdef TASK_TIMERS
        if(type > 0){
            toc_run = getticks();
            task_start[tid] = tic;
            task_finish[tid] = toc_run;
            task_tids[tid] = omp_get_thread_num();
            task_types[tid] = type;
        }
        #endif
      }
    
        qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);
    
    #ifdef TASK_TIMERS
    task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
    printf("Created task_start of size %i", m*n*K*K);
    task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
    task_tids = (int*)calloc(sizeof(int), m*n*K*K);
    task_types = (int*)calloc(sizeof(int), m*n*K*K);
    #endif
    
    if(s.rank == 0){
      /* Allocate and fill the original matrix. */
      if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
          (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
          (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
        error("Failed to allocate matrices.");
      for (k = 0; k < m * n * K * K; k++) {
        if (matrix == NULL)
          A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
        else
          A_orig[k] = matrix[k];
      }
      memcpy(A, A_orig, sizeof(double) * m * n * K * K);
      bzero(tau, sizeof(double) * m * n * K);
    
      /* Initialize the scheduler. */
        /* Allocate and init the task ID and resource ID matrix. */
    //  tic = getticks();
      if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
          (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
        error("Failed to allocate tid/rid matrix.");
      if( (rids = (double**)malloc(sizeof(double*)*m*n)) == NULL) 
        error("Failed to allocate rids.");
      if( (tau_id = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m*n) ) == NULL)
        error("Failed to allocate tau_id matrix.");
      if( (taus = (double**)malloc(sizeof(double*)*m*n)) == NULL)
        error("Failed to allocate taus.");
      for (k = 0; k < m * n; k++) {
        //tid[k] = qsched_task_none;
        rid[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K * K, (void**) &rids[k]);
        tau_id[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K, (void**) &taus[k]);
        memcpy(taus[k], tau, sizeof(double) * K);
      }
    }
        message("Synchronizing resources.");
        qsched_sync_resources(&s);
    
    if(s.rank == 0) {
      for (k = 0; k < m * n; k++) {
        tid[k] = qsched_task_none;
      }
    /* Build the tasks. */
      for (k = 0; k < m && k < n; k++) {
    
        /* Add kth corner task. */
        data[0] = k;
        data[1] = k;
        data[2] = k;
        MPI_data[0] = rid[k*m+k];
        MPI_data[1] = tau_id[k*m+k];
        #ifdef TASK_TIMERS
        MPI_data[2] = k;
        tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
                                   sizeof(long long int) * 3, 300);
        #else
        tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
                                 sizeof(long long int) * 2, 200);
        #endif
        qsched_addlock(&s, tid_new, rid[k * m + k]);
        qsched_addlock(&s, tid_new, tau_id[k*m+k]);
        if(k == 0)
        {
            memcpy(rids[k*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
        }
        if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
        tid[k * m + k] = tid_new;
    
        /* Add column tasks on kth row. */
        for (j = k + 1; j < n; j++) {
          data[0] = k;
          data[1] = j;
          data[2] = k;
          MPI_data[0] = rid[j*m+k];
          MPI_data[1] = rid[k*m+k];
          MPI_data[2] = tau_id[k*m+k];
          #ifdef TASK_TIMERS
          MPI_data[3] = k;
          MPI_data[4] = j;
          tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
                                   sizeof(long long int) * 5, 300);
          #else
          tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
                                   sizeof(long long int) * 3, 300);
          #endif
            if(k == 0)
            {
               memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
            }
          qsched_addlock(&s, tid_new, rid[j * m + k]);
          qsched_adduse(&s, tid_new, rid[k * m + k]);
          qsched_adduse(&s, tid_new, tau_id[k*m+k]);
          qsched_addunlock(&s, tid[k * m + k], tid_new);
          if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
          tid[j * m + k] = tid_new;
        }
    
        /* For each following row... */
        for (i = k + 1; i < m; i++) {
    
          /* Add the row taks for the kth column. */
          data[0] = i;
          data[1] = k;
          data[2] = k;
          MPI_data[0] = rid[k*m+i];
          MPI_data[1] = rid[k*m+k];
          MPI_data[2] = tau_id[k*m+i];
          #ifdef TASK_TIMERS
          MPI_data[3] = k;
          MPI_data[4] = i;
          tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
                                   sizeof(long long int) * 5, 300);
          #else
          tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
                                   sizeof(long long int) * 3, 300);
          #endif
            if(k == 0)
            {
                memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
            }
          qsched_addlock(&s, tid_new, rid[k * m + i]);
          qsched_adduse(&s, tid_new, rid[k * m + k]);
          qsched_addlock(&s, tid_new, tau_id[k*m+i]);
          qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
          if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
          tid[k * m + i] = tid_new;
          /* Add the inner tasks. */
          for (j = k + 1; j < n; j++) {
            data[0] = i;
            data[1] = j;
            data[2] = k;
            MPI_data[0] = rid[j*m+i];
            MPI_data[1] = rid[k*m+i];
            MPI_data[2] = rid[j*m+k];
            MPI_data[3] = tau_id[k*m+i];
            #ifdef TASK_TIMERS
            MPI_data[4] = k;
            MPI_data[5] = i;
            MPI_data[6] = j;
            tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
                                     sizeof(long long int) * 7, 300);
            #else
            tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
                                     sizeof(long long int) * 4, 500);
            #endif
            if(k == 0)
            {
                memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
            }
            qsched_addlock(&s, tid_new, rid[j * m + i]);
            qsched_adduse(&s, tid_new, rid[k * m + i]);
            qsched_addlock(&s, tid_new, rid[j * m + k]);
            qsched_adduse(&s, tid_new, tau_id[k*m+i]);
            // qsched_addunlock(&s, tid[k * m + i], tid_new);
            qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
            if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
    
            tid[j * m + i] = tid_new;
          }
        }
    
      } /* build the tasks. */
    }
    qsched_run_MPI(&s, nr_threads, runner);
      // Print off a hello world message
        printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
               processor_name, s.rank, s.count_ranks);
    
     //TODO Clean up the resource data.
        qsched_free(&s);
    
    #ifdef TASK_TIMERS
    FILE *file = NULL;
        if(s.rank == 0)
            file = fopen("Task_timers0.out", "w");
        else if (s.rank == 1)
            file = fopen("Task_timers1.out", "w");
        else if (s.rank == 2)
            file = fopen("Task_timers2.out", "w");
        else if (s.rank == 3)
            file = fopen("Task_timers3.out", "w");
    
        for(i = 0; i < m*n*K*K; i++)
        {
            if(task_types[i] > 0)
                fprintf(file, "%i %i %lli %lli\n", task_types[i], task_tids[i],task_start[i], task_finish[i]  );
        }
    
    
        fclose(file);
        free(task_types);
        free(task_tids);
        free(task_start);
        free(task_finish);
    #endif
        // Finalize the MPI environment.
        MPI_Finalize();
    }
    
    /**
     * @brief Main function.
     */
    
    int main(int argc, char* argv[]) {
    
      int c, nr_threads=1;
      int M = 4, N = 4, runs = 1, K = 32;
    
    /* Get the number of threads. */
    //#pragma omp parallel shared(nr_threads)
      //{
       // if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
      //}
    
      /* Parse the options */
      while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
          case 'm':
            if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
            break;
          case 'n':
            if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
            break;
          case 'k':
            if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
            break;
          case 'r':
            if (sscanf(optarg, "%d", &runs) != 1)
              error("Error parsing number of runs.");
            break;
          case 't':
            if (sscanf(optarg, "%d", &nr_threads) != 1)
              error("Error parsing number of threads.");
            omp_set_num_threads(nr_threads);
            message("omp_get_max_threads() = %i\n", omp_get_max_threads());
            message("omp_get_num_procs() = %i\n", omp_get_num_procs());
            break;
          case '?':
            fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
                    argv[0]);
            fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
                            "matrix using nr_threads threads.\n");
            exit(EXIT_FAILURE);
        }
    
      /* Dump arguments. */
      message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
              "threads (%i runs).",
              32 * M, 32 * N, nr_threads, runs);
    
      test_qr(M, N, K, nr_threads, runs, NULL);
    }