B.2. USER S GUIDE | Parallel Computing on Heterogeneous Networks (Wiley Series on Parallel and Distributed Computing)

B.1. SOURCE CODE

The text of the mpC program for parallel block cyclic matrix multiplication presented in Section 9.1.1.3 is broken down into four source files.

The first file contains the declaration of the ParallelAxB network type. The name of the second source file must be ParallelAxB.mpc. The contents of this file are as follows:

#define H(a, b, c, d, p) h[(a*p*p*p+b*p*p+c*p+d)] typedef struct {int I; int J;} Processor; nettype ParallelAxB(int p, int r, int n, int l, int w[p], int h[p*p*p*p]) {    coord I=p, J=p;    node {I>=0 && J>=0: bench*(w[J]*H(I, J, I, J,     p)*(n/l)*(n/l)*n);};    link (K=p, L=p)    {       I>=0 && J>=0 && I!=K : length*          (w[I]*H(I,J,I,J,p)*(n/l)*(n/l)*(r*r)*sizeof(double) [I,J]->[K,J];       I>=0 && J>=0 && J!=L && (H(I, J, K, L, p)>0) :       length*          (w[J]*H(I,J,K,L,p)*(n/l)*(n/l)*(r*r)*sizeof(double)) [I,J]->[K,L];    };    parent[0];    scheme    {       int k;       Processor Root, Receiver, Current;       for(k = 0; k < n; k++)       {          int Acolumn = k%l, Arow;          int Brow = k%l, Bcolumn;          par(Arow = 0; Arow <l; )          {             GetProcessor(Arow, Acolumn, p, h, w, &Root);             par(Receiver.I = 0; Receiver.I < p; Receiver.I++)                par(Receiver.J = 0; Receiver.J < p;                 Receiver.J++)                   if((Root.I != Receiver.I || Root.J != Receiver.J) &&                       Root.J != Receiver.J)                      if(H((Root.I),(Root.J),(Receiver.I),(Receiver.J),p) > 0)                          100/(w[Root.J]*((n/l))) %%                         [Root.I, Root.J] -> [Receiver.I, Receiver.J];             Arow += H((Root.I),(Root.J),(Root.I),(Root.J),p);          }          par(Bcolumn = 0; Bcolumn < l; )          {             GetProcessor(Brow, Bcolumn, p, h, w, &Root);             par(Receiver.I = 0; Receiver.I < p; Receiver.I++)                if(Root.I != Receiver.I)                (100/(H((Root.I),(Root.J),(Root.I),(Root.J),p)*(n/l))) %%                [Root.I, Root.J] -> [Receiver.I, Root.J];             Bcolumn += w[Root.J];          }          par(Current.I = 0; Current.I < p; Current.I++)          par(Current.J = 0; Current.J < p; Current.J++)                (100/n) %% [Current.I, Current.J];       }    }; };

The second file contains the main function. The name of this file must have extension mpc (e.g., hmxm.mpc). The contents of this file are as follows:

#include <math.h> #include <stdio.h> #include <stdlib.h> #include <mpc.h> #include "ParallelAxB.mpc" #include "Load_balance.mpc" #include "hmxm_i.mpc" int [*] main(int [host] argc, char** [host] argv) {    repl int p, r, n, l, *w, *h, *trow, *lcolumn;    repl double* speeds;    int [host] opt_l;    double [host] time_i, Elapsed_time;    p = [host]atoi(argv[1]);    r = [host]atoi(argv[2]);    n = [host]atoi(argv[3]);    /*     * Update the estimation of the processors speeds     */    {       repl int i, j;       repl double x[r*r], y[r*r], z[r*r];       for (i = 0; i < r; i++)          for (j = 0; j < r; j++) {             x[i*r+j] = 2.;             y[i*r+j] = 2.;             z[i*r+j] = 0.;          }       recon rMxM(x, y, z, r);    }    /*     * Detect the speed of the physical processors     */    speeds = (double*)malloc(sizeof(double)*p*p);    MPC_Get_processors_info(NULL, speeds);    /*             * w — widths of rectangular areas in generalized block             * h — heights of ‘horizontal-neighbours’ sub-rectangles             * trow — Top rows of rectangular areas in generalized block             * lcolumn — Left columns of these rectangular areas             */    w = (int*)malloc(sizeof(int)*p);    h = (int*)malloc(sizeof(int)*(p*p*p*p));    trow = (int*)malloc(sizeof(int)*p*p);    lcolumn = (int*)malloc(sizeof(int)*p);    /*    * Determine the optimal generalized block size    */    [host]:    {       int i;       double algo_time, min_algo_time = DBL_MAX;       for (i = p; i < n; i++) {          DistributeLoad(p, speeds, i, w, h, trow, lcolumn);          algo_time = timeof(net ParallelAxB(p, r, n, i, w, h) paxb);          if (algo_time < min_algo_time) {             opt_l = i;             min_algo_time = algo_time;          }       }    }    /*        * Start timing the algorithm        */    time_i = [host]MPC_Wtime();    /*        * Broadcast the optimal generalised block size        */    l = opt_l;    /*        * Determine widths, top rows, top columns of the rectangular        * areas in generalized block, and the heights of their        * ‘horizontal-neighbours’ sub-rectangles        */    DistributeLoad(p, speeds, l, w, h, trow, lcolumn);    free(speeds);    {       /*       * Create the mpC network             */       net ParallelAxB(p, r, n, l, w, h) g;       int [g] icoord, [g] jcoord;       /*           * Initialize the matrix elements and execute the algorithm           */       [g]:       {          int x, y, i, j, s, t, MyTopRow, MyLeftColumn;          double *a, *b, *c;          a = (double*)malloc(sizeof(double)*(n*r)*(n*r));          b = (double*)malloc(sizeof(double)*(n*r)*(n*r));          c = (double*)malloc(sizeof(double)*(n*r)*(n*r));          icoord = I coordof g;          jcoord = J coordof g;          MyTopRow = trow[icoord*p + jcoord];          MyLeftColumn = lcolumn[jcoord];          for (x = MyTopRow; x < n; x+=l)             for (y = MyLeftColumn; y < n; y+=l)                for (i = 0; i < H(icoord, jcoord, icoord, jcoord, p); i++)                   for (j = 0; j < w[jcoord]; j++)                      for (s = 0; s < r; s++)                         for (t = 0; t < r; t++) {                            a[(x*r*n*r) + y*r + (i*r*n*r) + j*r + (s*r*n) + t] = 2.;                            b[(x*r*n*r) + y*r + (i*r*n*r) + j*r + (s*r*n) + t] = 2.;                            c[(x*r*n*r) + y*r + (i*r*n*r) + j*r + (s*r*n) + t] = 0.;                         }          ([(p, r, n, l, w, h) g])ComputeAxB(                                        icoord, jcoord, a, b, c, trow, lcolumn);          {             free(w);             free(h);             free(trow);             free(lcolumn);             free(a);             free(b);             free(c);          }          ([(0)g])MPC_Barrier();       }    }    /*    * Print the algorithm execution time    */    [host]:    {       Elapsed_time = MPC_Wtime() - time_i;       printf("Problem size=%d, time(sec)=%0.6f, time(min)=%0.6f\n",                        n*r, Elapsed_time,                        Elapsed_time/60.);    } }

The name of the third source file must be hmxm_i.h. This file contains the definitions of functions rMxM, Height, GetProcessor, and ComputeAxB. The contents of this file are as follows:

/*     * This function is used in recon to update the estimation of     * the processors speeds     */ int rMxM (double *a, double *b, double *c, int r) {    int l, m, t;    for (l = 0; l < r; l++)       for (m = 0; m < r; m++)          for (t = 0; t < r; t++)             c[l*r + m] += a[l*r + t] * b[t*r + m] ;          return 0; } /*     * This function gives the height of ‘horizontal-neighbours’ sub-rectangles     * for different pairs of rectangles in generalized block     */ int Height(int top_row_1, int bottom_row_1, int top_row_2, int bottom_row_2) {    if ((top_row_1 >= top_row_2) && (bottom_row_1 <= bottom_row_2))       return (bottom_row_1 - top_row_1);    if ((top_row_1 <= top_row_2) && (bottom_row_1 >= bottom_row_2))       return (bottom_row_2 - top_row_2);    if ((top_row_1 <= top_row_2) && (bottom_row_1 >= top_row_2)         && (bottom_row_1 <= bottom_row_2))       return (bottom_row_1 - top_row_2);    if ( (top_row_1 >= top_row_2) && (top_row_1 <= bottom_row_2)        && (bottom_row_1 >= bottom_row_2))       return (bottom_row_2 - top_row_1);    return 0; } /*     * This function returns the coordinates of the root processor in the     * processor grid. The inputs to this function are the row and column     * of the root processor, the number of processors, the widths of     * the rectangular areas in the generalized block, and the heights of     * the ‘horizontal-neighbours’ sub-rectangles     */ int GetProcessor(const int row, const int column,                             const int p, const int *h,                             const int *w, Processor* proc) {    int x, y, i, j, tempy;    int *trow = (int*)malloc(sizeof(int)*p);    for (i = 0; i < p; i++)       trow[i] = 0;    for (x = 0; x < p; x++)    {       tempy = 0;       for (y = 0; y < p; y++)       {          int hi = H(x, y, x, y, p);          int wi = w[y];          if (x)             trow[y] += H((x-1), y, (x-1), y, p);          for (i = 0; i < hi; i++)             for (j = 0; j < wi; j++)                if (((row >= (trow[y] + i)) && (row < (trow[y] + hi)))&&                      ((column >= (tempy + j)) && (column < (tempy + wi))))                {                   proc->I = x;                   proc->J = y;                   free(trow);                   return 0;                }          tempy += w[y];       }    }    free(trow);    return 0; } /*    * This function performs the computations and communications * of the parallel algorithm */ void [net ParallelAxB(p, r, n, l, w, h) g]    ComputeAxB(int icoord, int jcoord, double *a, double *b,                             double *c, int *trow, int* lcolumn) {    int MyTopRow, MyLeftColumn;    int m, s, t, x, y, z;    repl int i, j, k;    repl Processor Me, Root, Receiver;    Me.I = icoord;    Me.J = jcoord;    MyTopRow = trow[(Me.I)*p + Me.J];    MyLeftColumn = lcolumn[Me.J];    for (k = 0; k < n; k++) {       int Acolumn = (k%l);       int Brow = (k%l);       /*       * Processor P(k,j) broadcasts a(k,j) to P(*,j) vertically       */       for (j = 0; j < n; j++) {          int Bcolumn = (j%l);          GetProcessor(Brow, Bcolumn, p, h, w, &Root);          {             flex subnet[g:(J==(Root.J))] cSubnet;             [cSubnet]:             {                int am_I_root = 0;                double temp[r*r];                if ((Me.I) == (Root.I)) {                   am_I_root = 1;                   for (x = 0; x < r; x++)                      for (y = 0; y < r; y++)                         temp[x*r + y] = b[k*r*n*r + j*r + x*n*r + y];                }                temp[] = [cSubnet:(I==(Root.I) && J==(Root.J))] temp[];                if (!am_I_root)                   for (x = 0; x < r; x++)                      for (y = 0; y < r; y++)                         b[k*r*n*r + j*r + x*n*r + y] = temp[x*r + y];             }          }       }       /*       * Processor P(i,k) broadcasts a(i,k) to P(i,*) horizontally       */       for (i = 0; i < n; i++) {          int Arow = (i%l);          GetProcessor(Arow, Acolumn, p, h, w, &Root);          {             flex subnet                [g:(Height(Arow,Arow+1,trow[I*p+J],trow[I*p+J]+H(I,J,I,J,p))>0)]                rSubnet;             [rSubnet]:             {                int am_I_root = 0;                double temp[r*r];                if (((Root.I) == (Me.I)) && ((Root.J) == (Me.J))) {                   am_I_root = 1;                   for (x = 0; x < r; x++)                      for (y = 0; y < r; y++)                         temp[x*r + y] = a[i*r*n*r + k*r + x*n*r + y];                }                temp[] = [rSubnet:(I==(Root.I) && J==(Root.J))]temp[];                if (!am_I_root)                   for (x = 0; x < r; x++)                      for (y = 0; y < r; y++)                         a[i*r*n*r + k*r + x*n*r + y] = temp[x*r + y];             }          }       }       for (x = MyTopRow; x < n; x+=l)          for (y = MyLeftColumn; y < n; y+=l)             for (i = 0; i < H((Me.I),(Me.J),(Me.I),(Me.J),p); i++)                for (j = 0; j < w[(Me.J)]; j++)                   for (s = 0; s < r; s++)                      for (m = 0; m < r; m++)                         for (t = 0; t < r; t++)                            c[x*r*n*r+y*r+i*r*n*r+j*r+s*n*r+m] +=                            a[x*r*n*r+i*r*n*r+k*r+s*n*r+t] * b[y*r+k*r*n*r+j*r+t*n*r+m];    }    return; }

The name of the fourth source file must be Load_balance.mpc. This file contains the definition of function DistributeLoad. The contents of this file are as follows:

 int DistributeLoad(int p, double *speeds, int l, int *w,                              int *h, int *trow, int* lcolumn) {    int i, j, k, x, y, csum[p], tsum = 0;    double total = 0.;    for (i = 0; i < p; i++)    {       csum[i] = 0;       for (j = 0; j < p; j++)          csum[i] += speeds[j*p + i];       tsum += csum[i];    }    for (i = 0; i < p; i++)       for (j = 0; j < p; j++)       {          double temp = (speeds[i*p+j]/(double)csum[j])*l;          if (temp < 1.0)             temp = 1.0;          temp = floor(temp);          H(i,j,i,j,p) = temp;       }    for (j = 0; j < p; j++)    {       total = 0.;       for (i = 0; i < p; i++)          total += H(i,j,i,j,p);       if (total > l)       {          int ind = 0;          for (i = 0; i < p; i++)             if (H(i,j,i,j,p) > 1.0)             {                ind++;                H(i,j, i,j,p) -= 1.0;                if ((total - ind) == l)                   break;             }       } else if (total < l)       {          int ind = 0;          for (i = 0; i < p; i++)          {             ind++;             H(i,j,i,j,p) += 1.0;             if ((total + ind) == l)                break;          }       }    }    total = 0.;    for (i = 0; i < p; i++)    {       double temp = ((double)csum[i]/(double)tsum)*l;       if (temp < 1.0)          temp = 1.0;       temp = floor(temp);       w[i] = temp;    }    for (i = 0; i < p; i++)       total += w[i];    if (total > l)    {       int ind = 0;       for (i = 0; i < p; i++)          if (w[i] > 1.0)          {             ind++;             w[i] -= 1.0;             if ((total - ind) == l)                break;          }    } else if (total < l)    {       int ind = 0;       for (i = 0; i < p; i++)       {          ind++;          w[i] += 1.0;          if ((total + ind) == l)                break;          }       }       for (i = 0; i < p; i++)          for (j = 0; j < p; j++)          {             trow[i*p+j] = 0;             for (k = 0; k < i; k++)                trow[i*p+j] += H(k, j, k, j, p);          }       for (i = 0; i < p; i++)       {          lcolumn[i] = 0;          for (x = 0; x < i; x++)             lcolumn[i] += w[x];       }       for (i = 0; i < p; i++)          for (j = 0; j < p; j++)             for (x = 0; x < p; x++)                for (y = 0; y < p; y++)                {                   int height = CommonHeight(trow[i*p+j], trow[i*p+j]+H(i,j,i,j,p),                            trow[x*p+y], trow[x*p+y]+H(x,y,x,y,p));                   if (height > 0)                      H(i,j,x,y,p) = height;                   }    return 0; }