Skip to content
Snippets Groups Projects
test_qr_ompss.c 17.21 KiB
/*******************************************************************************
 * This file is part of QuickSched.
 * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
* *****************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>

/* LAPACKE header. */
#include <lapacke.h>
#include <cblas.h>

/* Local includes. */
#include "cycle.h"

/* Error macro. */
#define error(s, ...)                                                        \
  {                                                                          \
    fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \
            ##__VA_ARGS__);                                                  \
    abort();                                                                 \
  }

/* Message macro. */
#define message(s, ...)                                 \
  {                                                     \
    printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__); \
    fflush(stdout);                                     \
  }

/* Stuff to collect task data. */
struct timer {
  int threadID, type;
  ticks tic, toc;
};
struct timer* timers;
int nr_timers = 0;


double* columnToTile(double* columnMatrix, int size, int m, int n,
                     int tilesize) {
  double* TileMatrix;
  TileMatrix = malloc(sizeof(double) * size);
  if (TileMatrix == NULL) error("failed to allocate TileMatrix");
  int i, j, k, l;

  for (i = 0; i < n; i++) {
    for (j = 0; j < m; j++) {
      double* tileStart =
          &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
      double* tilePos =
          &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
      for (k = 0; k < tilesize; k++) {
        tileStart = &columnMatrix[i * m * tilesize * tilesize +
                                  k * m * tilesize + j * tilesize];
        for (l = 0; l < tilesize; l++) {
          tilePos[k * tilesize + l] = tileStart[l];
        }
      }
    }
  }

  return TileMatrix;
}

double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
  double* ColumnMatrix;
  ColumnMatrix = (double*)malloc(sizeof(double) * size);
  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
  int i, j, k, l;
  for (i = 0; i < n; i++) {
    for (j = 0; j < m; j++) {
      /* Tile on ith column is at i*m*32*32.*/
      /* Tile on jth is at j*32*32 */
      double* tile =
          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
      /* Column starts at same position as tile. */
      /* Row j*32.*/
      double* tilePos =
          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
      for (k = 0; k < tilesize; k++) {
        for (l = 0; l < tilesize; l++) {
          tilePos[l] = tile[l];
        }
        /* Next 32 elements are the position of the tile in the next column.*/
        tile = &tile[tilesize];
        /* Move to the j*32th position in the next column. */
        tilePos = &tilePos[tilesize * m];
      }
    }
  }
  return ColumnMatrix;
}

/* Routines for the tiled QR decomposition.*/

/**
 *
 * @brief Computes the QR decomposition of a tile.
 *
 * @param cornerTile A pointer to the tile for which the decomposition is
* computed.
 * @param tilesize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param k The value of k for the QR decomposition
 * @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
 *
 *
 */
#pragma omp task inout(cornerTile[0]) out(tauMatrix[0])
void DGEQRF(double* restrict cornerTile, int tileSize,
            double* restrict tauMatrix, int k, int tauNum) {
  int i, j, n;
  double norm = 0.0, sign, u1, tau, z;
  double w[tileSize];

  /* Timer stuff. */
  int ind = __sync_fetch_and_add( &nr_timers , 1 );
  timers[ind].threadID = omp_get_thread_num();
  timers[ind].type = 0;
  timers[ind].tic = getticks();
    
  /*Find the householder vector for each row. */
  for (i = 0; i < tileSize; i++) {
    norm = 0.0;
    /*Fill w with the vector.*/
    for (j = i; j < tileSize; j++) {
      /* ith row is i*tileSize, only want elements on diagonal or below.*/
      w[j] = cornerTile[i * tileSize + j];
      /*Find the norm as well */
      norm = norm + w[j] * w[j];
    }
    if (w[i] >= 0.0)
      sign = -1;
    else
      sign = 1;

    norm = sqrt(norm);

    u1 = w[i] - sign * norm;

    if (u1 != 0.0) {
      for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
    } else {
      for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
    }

    if (norm != 0.0)
      tau = -sign * u1 / norm;
    else
      tau = 0.0;

    /*Store the below diagonal vector */

    for (j = i + 1; j < tileSize; j++) {
      cornerTile[i * tileSize + j] = w[j];
    }
    cornerTile[i * tileSize + i] = sign * norm;
    w[i] = 1.0;
    /* Apply the householder transformation to the rest of the tile, for
     * everything to the right of the diagonal. */
    for (j = i + 1; j < tileSize; j++) {
      /*Compute w'*A_j*/
      z = cornerTile[j * tileSize + i];
      for (n = i + 1; n < tileSize; n++) {
        z = z + cornerTile[j * tileSize + n] * w[n];
      }
      /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
      for (n = i; n < tileSize; n++) {
        cornerTile[j * tileSize + n] =
            cornerTile[j * tileSize + n] - tau * w[n] * z;
      }
    }
    /* Store tau. We're on k*tileSize+ith row. kth column.*/
    tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
  }
  timers[ind].toc = getticks();
}

/**
 *
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile A pointer to the tile for which the householder is stored.
 * @param rowTiles The tile to which the householder is applied.
 * @param tilesize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param jj The value of j for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
 *
 *
 */
#pragma omp task in(cornerTile[0]) inout(rowTile[0]) in(tauMatrix[0])
void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
            int jj, int kk, double* restrict tauMatrix, int tauNum) {
  int i, j, n;
  double z = 0.0;
  double w[tileSize];

  /* Timer stuff. */
  int ind = __sync_fetch_and_add( &nr_timers , 1 );
  timers[ind].threadID = omp_get_thread_num();
  timers[ind].type = 1;
  timers[ind].tic = getticks();
    
  /* For each row in the corner Tile*/
  for (i = 0; i < tileSize; i++) {
    /*Get w for row i */
    for (j = i; j < tileSize; j++) {
      w[j] = cornerTile[i * tileSize + j];
    }
    w[i] = 1.0;

    /* Apply to the row Tile */
    for (j = 0; j < tileSize; j++) {
      z = 0.0;
      /* Below Diagonal!*/
      /*Compute w'*A_j*/
      for (n = i; n < tileSize; n++) {
        z = z + w[n] * rowTile[j * tileSize + n];
      }
      for (n = i; n < tileSize; n++) {
        rowTile[j * tileSize + n] =
            rowTile[j * tileSize + n] -
            tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
      }
    }
  }
  timers[ind].toc = getticks();
}

/**
 *
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile The corner tile for this value of k.
 * @param columnTile The tile for which householders are computed.
 * @param tilesize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param ii The value of i for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
 *
 *
 */
#pragma omp task inout(cornerTile[0]) inout(columnTile[0]) out(tauMatrix[0])
void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
            int tilesize, int ii, int kk, double* restrict tauMatrix,
            int tauNum) {
  int i, j, n;
  double norm = 0.0, sign, u1, tau, z;
  double w[2*tilesize];

  /* Timer stuff. */
  int ind = __sync_fetch_and_add( &nr_timers , 1 );
  timers[ind].threadID = omp_get_thread_num();
  timers[ind].type = 2;
  timers[ind].tic = getticks();
    
  /* For each column compute the householder vector. */
  for (i = 0; i < tilesize; i++) {
    norm = 0.0;
    w[i] = cornerTile[i * tilesize + i];
    norm = norm + w[i] * w[i];
    for (j = i + 1; j < tilesize; j++) {
      w[j] = 0.0;
    }
    for (j = 0; j < tilesize; j++) {
      w[tilesize + j] = columnTile[i * tilesize + j];
      norm = norm + w[tilesize + j] * w[tilesize + j];
    }

    norm = sqrt(norm);
    if (w[i] >= 0.0)
      sign = -1;
    else
      sign = 1;

    u1 = w[i] - sign * norm;
    if (u1 != 0.0) {
      for (j = i + 1; j < 2 * tilesize; j++) {
        w[j] = w[j] / u1;
      }
    } else {
      for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
    }

    if (norm != 0)
      tau = -sign * u1 / norm;
    else
      tau = 0.0;

    /* Apply to each row to the right.*/
    for (j = i; j < tilesize; j++) {
      /* Find w'*A_j, w is 0s except for first value with upper tile.*/
      z = 1.0 * cornerTile[j * tilesize + i];
      for (n = 0; n < tilesize; n++) {
        z = z + w[tilesize + n] * columnTile[j * tilesize + n];
      }
      /* Apply to upper tile.*/
      cornerTile[j * tilesize + i] =
          cornerTile[j * tilesize + i] - tau * 1.0 * z;
      for (n = i + 1; n < tilesize; n++) {
        cornerTile[j * tilesize + n] =
            cornerTile[j * tilesize + n] - tau * w[n] * z;
      }
      /* Apply to lower tile.*/
      for (n = 0; n < tilesize; n++) {
        columnTile[j * tilesize + n] =
            columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
      }
    }
    /* Store w*/
    for (j = 0; j < tilesize; j++) {
      columnTile[i * tilesize + j] = w[tilesize + j];
    }
    tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
  }
  timers[ind].toc = getticks();
}

/**
 *
 * @brief Applies the householder factorisation of the corner to the row tile.
 *
 * @param cornerTile A pointer to the tile to have the householder applied.
 * @param columnTile The tile in which the householders are stored.
 * @param rowTile The upper tile to have the householders applied.
 * @param tilesize The number of elements on a row/column of the tile.
 * @param tauMatrix A pointer to the tau Matrix.
 * @param ii The value of i for the QR decomposition.
 * @param jj The value of j for the QR decomposition.
 * @param kk The value of k for the QR decomposition.
 * @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
 *
 *
 */
#pragma omp task inout(cornerTile[0]) in(columnTile[0]) inout(rowTile[0]) in(tauMatrix[0])
void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
            double* restrict rowTile, int tilesize, int ii, int jj, int kk,
            double* restrict tauMatrix, int tauNum) {
  int i, j, n;
  double z;
  double w[2*tilesize];

  /* Timer stuff. */
  int ind = __sync_fetch_and_add( &nr_timers , 1 );
  timers[ind].threadID = omp_get_thread_num();
  timers[ind].type = 3;
  timers[ind].tic = getticks();
    
  for (i = 0; i < tilesize; i++) {
    for (j = 0; j < i; j++) w[j] = 0.0;
    w[i] = 1.0;
    for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
    for (j = 0; j < tilesize; j++)
      w[j + tilesize] = columnTile[i * tilesize + j];

    /* Apply householder vector (w) to the tiles.*/
    for (j = 0; j < tilesize; j++) {
      z = 0.0;
      /* Compute w' * A_j */
      for (n = 0; n < tilesize; n++) {
        z += w[n] * rowTile[j * tilesize + n];
        z += w[n + tilesize] * cornerTile[j * tilesize + n];
      }
      for (n = 0; n < tilesize; n++) {
        rowTile[j * tilesize + n] =
            rowTile[j * tilesize + n] -
            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
        cornerTile[j * tilesize + n] =
            cornerTile[j * tilesize + n] -
            tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
      }
    }
  }
  timers[ind].toc = getticks();
}

/**
 * @brief Computed a tiled QR factorization using QuickSched.
 *
 * @param m Number of tile rows.
 * @param n Number of tile columns.
 * @param nr_threads Number of threads to use.
 */

void test_qr(int m, int n, int K, int nr_threads, int runs) {

  int k, j, i, r;
  double* A, *A_orig, *tau;
  ticks tic, toc_run, tot_run = 0;
  double tid[ m * n ];

  /* Allocate and fill the original matrix. */
  if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
      (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
      (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
    error("Failed to allocate matrices.");
  for (k = 0; k < m * n * K * K; k++)
    A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
  bzero(tau, sizeof(double) * m * n * K);

  /* Dump A_orig. */
  /* message( "A_orig = [" );
  for ( k = 0 ; k < m*K ; k++ ) {
      for ( j = 0 ; j < n*K ; j++ )
          printf( "%.3f " , A_orig[ j*m*K + k ] );
      printf( "\n" );
      }
  printf( "];\n" ); */

  /* Loop over the number of runs. */
  for (r = 0; r < runs; r++) {

    /* Start the clock. */
    tic = getticks();
    nr_timers = 0;

    /* Launch the tasks. */
    for (k = 0; k < m && k < n; k++) {

      /* Add kth corner task. */
      // #pragma omp task inout( tid[ k*m + k ] )
      DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);

      /* Add column tasks on kth row. */
      for (j = k + 1; j < n; j++) {
        // #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] )
        DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
               m);
      }

      /* For each following row... */
      for (i = k + 1; i < m; i++) {

        /* Add the row taks for the kth column. */
        // #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] )
        DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
               m);

        /* Add the inner tasks. */
        for (j = k + 1; j < n; j++) {
          // #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[
          // j*m + k ] )
          DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
                 &A[(j * m + k) * K * K], K, i, j, k, tau, m);
        }
      }

    } /* build the tasks. */

/* Collect timers. */
#pragma omp taskwait
    toc_run = getticks();
    message("%ith run took %lli ticks...", r, toc_run - tic);
    tot_run += toc_run - tic;
  }

  /* Dump the costs. */
  message("costs: run=%lli ticks.", tot_run / runs);
  /* Dump the tasks. */
  /* for ( k = 0 ; k < nr_timers ; k++ )
      printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type ,
     timers[k].tic , timers[k].toc ); */
}

/**
 * @brief Main function.
 */

int main(int argc, char* argv[]) {

  int c, nr_threads;
  int M = 4, N = 4, runs = 1, K = 32;

/* Get the number of threads. */
#pragma omp parallel shared(nr_threads)
  {
    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
  }

  /* Parse the options */
  while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
      case 'm':
        if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
        break;
      case 'n':
        if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
        break;
      case 'k':
        if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
        break;
      case 'r':
        if (sscanf(optarg, "%d", &runs) != 1)
          error("Error parsing number of runs.");
        break;
      case 't':
        if (sscanf(optarg, "%d", &nr_threads) != 1)
          error("Error parsing number of threads.");
        omp_set_num_threads(nr_threads);
        break;
      case '?':
        fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
                argv[0]);
        fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
                        "matrix using nr_threads threads.\n");
        exit(EXIT_FAILURE);
    }

  /* Dump arguments. */
  message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
          "threads (%i runs).",
          K * M, K * N, nr_threads, runs);

  /* Initialize the timers. */
  if ((timers = (struct timer*)malloc(sizeof(struct timer) * M * M * N)) ==
      NULL)
    error("Failed to allocate timers.");

  test_qr(M, N, K, nr_threads, runs);
}