7
Increasing data locality of parallel programs executed in embedded systems
Livermore loop Kernel 1 (hydro fragment) Integer loop=10000, ARRAYSIZE = sizeof(int)*1048576 | |||||
i 90 8 80 I 70 o 60 j| 50 = 40 .2 30 i20 IH 10 ' 0 | |||||
42 | |||||
Sequential Execution Time |
Parallel ] |
Parallel SFS |
Parallel SFS with Array Contraction |
Matrix Multiplication Integer N=2048, B=32 | ||||
| 180 |
*181 | |||
8 | ||||
■g 120 S 100 c 80 o 60 3 40 S 20 ^ 0 | ||||
60 | ||||
Sequential Execution Time |
Parallel SFS |
Parallel SFS with Blocking |
Parallel SFS with Blocking & Array Contraction |
(b)
Figure 1. Execution time of (a) Livermore Loops Kernel 1 (b) matrix multiplication
#define SIZE 100000000 |
#include <omp.h> |
int main(void) { |
#define SIZE 100000000 int main(void) |
double *a = new double[SIZE]; |
{ |
for(long i=0; i<SIZE; i++) |
omp_set_num_threads(2); |
a[i]=(double)i+1; |
double *a=new double [SIZE] ; |
for(long i=0; i<(SIZE-2); i++) |
for(long i=0; i<SIZE; i++) |
a[i+2]=sin((a[i]*SIZE+1)/SIZE); |
a[i]=(double)i+1; |
return 0; |
#pragma omp parallel for |
} |
private(j,i) shared(a) |
(a) Sequential code |
for(long j=0; j < 2 ; j++) for(long i=j; i<(SIZE-2); i+=2) a[i+2]=sin((a[i]*SIZE+1)/SIZE); return 0; } (b) Parallel code |
Figure 2. Decreased performance in the parallel version of a simple code