High performance computing systems Lab 2
Dept. of Computer Architecture
Faculty of ETI
Gdansk University of Technology
Paweł Czarnul
The following example presents a simple master-slave application for parallel integration of a given function.
The code assumes that the range to be integrated is divided into a predefined number of subranges.
This number should be considerably larger than the number of processes in the application; with a=1, b=100 and RANGESIZE=1 as in the code below, for example, the range is split into 99 subranges, far more than the few slave processes used in the test runs. This approach has several advantages. The application balances computations among slave processes quite well even if:
1. computations performed by slaves are not equal in terms of execution time (this is important as in some cases the execution time is not known in advance);
2. particular slaves run on processors of different speeds;
3. some nodes are overloaded by processes of other users.
Still, there is room for improvement:
1. using non-blocking MPI_I* functions so that a slave already has the data for the following computations while it is still processing the current ones (see the sketch after this list);
2. making the master compute parts of the data as well.
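As an illustration of the first point, the fragment below is a minimal sketch (not part of the lab code) of how a slave loop could use non-blocking MPI_Irecv to prefetch the next message while the current subrange is being integrated. It assumes the same DATA, RESULT and FINISH tags and the SimpleIntegration() function of the program listed below; for real overlap the master would also have to keep one extra subrange in flight per slave.

// sketch of a slave loop with a prefetched subrange (assumes the tags and
// SimpleIntegration() of the program below)
double current[2], next[2], resulttemp;
MPI_Status status;
MPI_Request request;
// receive the first subrange in the usual blocking way
MPI_Recv(current, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
while (status.MPI_TAG == DATA) {
    // start receiving the next message (DATA or FINISH) in the background
    MPI_Irecv(next, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &request);
    // integrate the current subrange while the next one is (hopefully) in transit
    resulttemp = SimpleIntegration(current[0], current[1]);
    MPI_Send(&resulttemp, 1, MPI_DOUBLE, 0, RESULT, MPI_COMM_WORLD);
    // make sure the prefetched message has actually arrived before using it
    MPI_Wait(&request, &status);
    current[0] = next[0];
    current[1] = next[1];
}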
Note that the example uses a very simple method for integration. A more advanced approach would be to implement an adaptive method such as adaptive quadrature, in which the precision is adjusted dynamically based on the behavior of the given function. On the other hand, subranges (with different precision values) then require different amounts of time, which makes the dynamic distribution of subranges used here even more important.
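A sequential routine of this kind could, for example, be sketched as follows (this is only an illustration assuming a simple recursive trapezoid rule; it is not part of the lab code, and the tolerance handling shown is one possible choice). It reuses f() and fabs() from math.h:

// sketch of adaptive integration: the subrange [a,b] is split recursively
// until refining the estimate changes it by less than the tolerance eps
double AdaptiveIntegration(double a, double b, double fa, double fb, double eps) {
    double m = (a + b) / 2, fm = f(m);
    double coarse = (fa + fb) * (b - a) / 2;        // one trapezoid over [a,b]
    double fine = (fa + 2 * fm + fb) * (b - a) / 4; // two trapezoids over [a,m] and [m,b]
    if (fabs(fine - coarse) < eps)
        return fine;
    // otherwise refine both halves, splitting the allowed error between them
    return AdaptiveIntegration(a, m, fa, fm, eps / 2) +
           AdaptiveIntegration(m, b, fm, fb, eps / 2);
}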
#include <stdio.h>
#include <mpi.h>
#include <math.h>
#define PRECISION 0.000001
#define RANGESIZE 1
#define DATA 0
#define RESULT 1
#define FINISH 2
double f(double x) {
    return sin(x) * sin(x) / x;
}
double SimpleIntegration(double a, double b) {
    double i;
    double sum = 0;
    for (i = a; i < b; i += PRECISION)
        sum += f(i) * PRECISION;
    return sum;
}
int main(int argc, char **argv) {
    int myrank, proccount;
    double a = 1, b = 100;
    double range[2];
    double result = 0, resulttemp;
    int sentcount = 0;
    int i;
    MPI_Status status;

    // Initialize MPI
    MPI_Init(&argc, &argv);
    // find out my rank
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // find out the number of processes in MPI_COMM_WORLD
    MPI_Comm_size(MPI_COMM_WORLD, &proccount);
    if (proccount < 2) {
        printf("Run with at least 2 processes");
        MPI_Finalize();
        return -1;
    }
    if (((b - a) / RANGESIZE) < 2 * (proccount - 1)) {
        printf("More subranges needed");
        MPI_Finalize();
        return -1;
    }
    // now the master will distribute the data and slave processes will perform computations
    if (myrank == 0) {
        range[0] = a;
        // first distribute some ranges to all slaves
        for (i = 1; i < proccount; i++) {
            range[1] = range[0] + RANGESIZE;
#ifdef DEBUG
            printf("\nMaster sending range %f,%f to process %d", range[0], range[1], i);
            fflush(stdout);
#endif
            // send it to process i
            MPI_Send(range, 2, MPI_DOUBLE, i, DATA, MPI_COMM_WORLD);
            sentcount++;
            range[0] = range[1];
        }
        do {
            // distribute remaining subranges to the processes which have completed their parts
            MPI_Recv(&resulttemp, 1, MPI_DOUBLE, MPI_ANY_SOURCE, RESULT, MPI_COMM_WORLD, &status);
            result += resulttemp;
#ifdef DEBUG
            printf("\nMaster received result %f from process %d", resulttemp, status.MPI_SOURCE);
            fflush(stdout);
#endif
            // check the sender and send some more data
            range[1] = range[0] + RANGESIZE;
            if (range[1] > b)
                range[1] = b;
#ifdef DEBUG
            printf("\nMaster sending range %f,%f to process %d", range[0], range[1], status.MPI_SOURCE);
            fflush(stdout);
#endif
            MPI_Send(range, 2, MPI_DOUBLE, status.MPI_SOURCE, DATA, MPI_COMM_WORLD);
            range[0] = range[1];
        } while (range[1] < b);
        // now receive results from the processes
        for (i = 0; i < (proccount - 1); i++) {
            MPI_Recv(&resulttemp, 1, MPI_DOUBLE, MPI_ANY_SOURCE, RESULT, MPI_COMM_WORLD, &status);
#ifdef DEBUG
            printf("\nMaster received result %f from process %d", resulttemp, status.MPI_SOURCE);
            fflush(stdout);
#endif
            result += resulttemp;
        }
        // shut down the slaves
        for (i = 1; i < proccount; i++) {
            MPI_Send(NULL, 0, MPI_DOUBLE, i, FINISH, MPI_COMM_WORLD);
        }
        // now display the result
        printf("\nHi, I am process 0, the result is %f\n", result);
    } else { // slave
        // this is easy - just receive data and do the work
        do {
            MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            if (status.MPI_TAG == DATA) {
                MPI_Recv(range, 2, MPI_DOUBLE, 0, DATA, MPI_COMM_WORLD, &status);
                // compute my part
                resulttemp = SimpleIntegration(range[0], range[1]);
                // send the result back
                MPI_Send(&resulttemp, 1, MPI_DOUBLE, 0, RESULT, MPI_COMM_WORLD);
            }
        } while (status.MPI_TAG != FINISH);
    }
    // Shut down MPI
    MPI_Finalize();
    return 0;
}
How to compile and run on the KASK cluster?
Note that:
-np 2 means 1 slave
-np 3 means 2 slaves
-np 5 means 4 slaves
[klaster@n01 ~]$ mpicc program2.c
[klaster@n01 ~]$ time mpirun -np 1 ./a.out
Run with at least 2 processes
real    0m0.099s
user    0m0.065s
sys     0m0.029s
[klaster@n01 ~]$ time mpirun -np 2 ./a.out
Hi, I am process 0, the result is 2.516266
real    0m19.336s
user    0m38.550s
sys     0m0.057s
[klaster@n01 ~]$ time mpirun -np 3 ./a.out
Hi, I am process 0, the result is 2.516266
real    0m9.796s
user    0m29.154s
sys     0m0.102s
[klaster@n01 ~]$ time mpirun -np 5 ./a.out
Hi, I am process 0, the result is 2.516266
real    0m6.315s
user    0m26.131s
sys     0m0.164s
[klaster@n01 ~]$
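If Open MPI is the MPI installation used on the cluster (an assumption here; the exact options depend on the installed MPI), the processes could also be spread over several nodes with a host file. The node names and slot counts below are examples only:

[klaster@n01 ~]$ cat hosts
n01 slots=2
n02 slots=2
[klaster@n01 ~]$ mpirun -np 4 --hostfile hosts ./a.out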