bhjghjh - 8 months ago 109

C++ Question

I am trying to learn parallelization with openmp. I have written a c++ script which calculates 10 dimensional integration through MC for the function:

F = x1+ x2 + x3 +...+x10

now I am trying to convert it to work with openmp with 4 threads. my serial code gives intelligible output, so I am kind of convinced that it works fine.

here is my serial code:

I want to output for every 4^k iterations for N= number of sample points.

`/* compile with`

$ g++ -o monte ND_MonteCarlo.cpp

$ ./monte N

unsigned long long int for i, N

Maximum value for UNSIGNED LONG LONG INT 18446744073709551615

*/

#include <iostream>

#include <fstream>

#include <iomanip>

#include <cmath>

#include <cstdlib>

#include <ctime>

using namespace std;

//define multivariate function F(x1, x2, ...xk)

double f(double x[], int n)

{

double y;

int j;

y = 0.0;

for (j = 0; j < n; j = j+1)

{

y = y + x[j];

}

y = y;

return y;

}

//define function for Monte Carlo Multidimensional integration

double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)

{

double r, x[n], v;

int i, j;

r = 0.0;

v = 1.0;

// step 1: calculate the common factor V

for (j = 0; j < n; j = j+1)

{

v = v*(b[j]-a[j]);

}

// step 2: integration

for (i = 1; i <= m; i=i+1)

{

// calculate random x[] points

for (j = 0; j < n; j = j+1)

{

x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));

}

r = r + fn(x,n);

}

r = r*v/m;

return r;

}

double f(double[], int);

double int_mcnd(double(*)(double[],int), double[], double[], int, int);

int main(int argc, char **argv)

{

/* define how many integrals */

const int n = 10;

double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,5.0};

double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};

double result, mean;

int m;

unsigned long long int i, N;

// initial seed value (use system time)

srand(time(NULL));

cout.precision(6);

cout.setf(ios::fixed | ios::showpoint);

// current time in seconds (begin calculations)

time_t seconds_i;

seconds_i = time (NULL);

m = 4; // initial number of intervals

// convert command-line input to N = number of points

N = atoi( argv[1] );

for (i=0; i <=N/pow(4,i); i++)

{

result = int_mcnd(f, a, b, n, m);

mean = result/(pow(10,10));

cout << setw(30) << m << setw(30) << result << setw(30) << mean <<endl;

m = m*4;

}

// current time in seconds (end of calculations)

time_t seconds_f;

seconds_f = time (NULL);

cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;

return 0;

}

and output:

`N integral mean_integral`

4 62061079725.185936 6.206108

16 33459275100.477665 3.345928

64 -2204654740.788784 -0.220465

256 4347440045.990804 0.434744

1024 -1265056243.116922 -0.126506

4096 681660387.953380 0.068166

16384 -799507050.896809 -0.079951

65536 -462592561.594820 -0.046259

262144 50902035.836772 0.005090

1048576 -91104861.129695 -0.009110

4194304 3746742.588701 0.000375

16777216 -32967862.853915 -0.003297

67108864 17730924.602974 0.001773

268435456 -416824.977687 -0.00004

1073741824 2843188.477219 0.000284

But I think my parallel code is not working at all. I know I'm doing something silly of course .As my number of threads are 4, I wanted to divide results by 4, and the output is ridiculous.

here is a parallel version of the same code:

`/* compile with`

$ g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm

$ ./mcOMP N

unsigned long long int for i, N

Maximum value for UNSIGNED LONG LONG INT 18446744073709551615

*/

#include <iostream>

#include <fstream>

#include <iomanip>

#include <cmath>

#include <cstdlib>

#include <ctime>

#include <omp.h>

using namespace std;

//define multivariate function F(x1, x2, ...xk)

double f(double x[], int n)

{

double y;

int j;

y = 0.0;

for (j = 0; j < n; j = j+1)

{

y = y + x[j];

}

y = y;

return y;

}

//define function for Monte Carlo Multidimensional integration

double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)

{

double r, x[n], v;

int i, j;

r = 0.0;

v = 1.0;

// step 1: calculate the common factor V

#pragma omp for

for (j = 0; j < n; j = j+1)

{

v = v*(b[j]-a[j]);

}

// step 2: integration

#pragma omp for

for (i = 1; i <= m; i=i+1)

{

// calculate random x[] points

for (j = 0; j < n; j = j+1)

{

x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));

}

r = r + fn(x,n);

}

r = r*v/m;

return r;

}

double f(double[], int);

double int_mcnd(double(*)(double[],int), double[], double[], int, int);

int main(int argc, char **argv)

{

/* define how many integrals */

const int n = 10;

double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};

double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};

double result, mean;

int m;

unsigned long long int i, N;

int NumThreads = 4;

// initial seed value (use system time)

srand(time(NULL));

cout.precision(6);

cout.setf(ios::fixed | ios::showpoint);

// current time in seconds (begin calculations)

time_t seconds_i;

seconds_i = time (NULL);

m = 4; // initial number of intervals

// convert command-line input to N = number of points

N = atoi( argv[1] );

#pragma omp parallel private(result, mean) shared(N, m) num_threads(NumThreads)

for (i=0; i <=N/pow(4,i); i++)

{

result = int_mcnd(f, a, b, n, m);

mean = result/(pow(10,10));

#pragma omp master

cout << setw(30) << m/4 << setw(30) << result/4 << setw(30) << mean/4 <<endl;

m = m*4;

}

// current time in seconds (end of calculations)

time_t seconds_f;

seconds_f = time (NULL);

cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;

return 0;

}

I want only the master thread to output the values.

I compiled with:

`g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm`

your help and suggestion to fix the code is most appreciated. thanks a lot.

Answer

Let's see what your program does. At `omp parallel`

, your threads are spawned and they will execute the remaining code in parallel. Operations like:

```
m = m * 4;
```

Are undefined (and make no sense generally, as they are executed four times per iteration).

Further, when those threads encounter a `omp for`

, they will share the work of the loop, i.e. each iteration will be executed only once by some thread. Since `int_mcnd`

is executed within a `parallel`

region, all it's local variables are private. You have no construct in your code to actually collect those private results (also `result`

and `mean`

are private).

The correct approach is to use a parallel for loop with `reduction`

clause, indicating that there is a variable (`r`

/`v`

) that is being aggregated throughout the execution of the loop.

To allow this, the reduction variables need to be declared as shared, outside of the scope of the parallel region. The easiest solution is to move the parallel region inside of `int_mcnd`

. This also avoid the race condition for `m`

.

There is one more hurdle: `rand`

is using global state and at least my implementation is locked. Since most of the time is spent into `rand`

, your code would scale horribly. The solution is to use an explicit threadprivate state via `rand_r`

. (See also this question).

Piecing it together, the modified code looks like this:

```
double int_mcnd(double (*fn)(double[], int), double a[], double b[], int n, int m)
{
// Reduction variables need to be shared
double r = 0.0;
double v = 1.0;
#pragma omp parallel
// All variables declared inside are private
{
// step 1: calculate the common factor V
#pragma omp for reduction(* : v)
for (int j = 0; j < n; j = j + 1)
{
v = v * (b[j] - a[j]);
}
// step 2: integration
unsigned int private_seed = omp_get_thread_num();
#pragma omp for reduction(+ : r)
for (int i = 1; i <= m; i = i + 1)
{
// Note: X MUST be private, otherwise, you have race-conditions again
double x[n];
// calculate random x[] points
for (int j = 0; j < n; j = j + 1)
{
x[j] = a[j] + (rand_r(&private_seed)) / ((RAND_MAX / (b[j] - a[j])));
}
r = r + fn(x, n);
}
}
r = r * v / m;
return r;
}
double f(double[], int);
double int_mcnd(double (*)(double[], int), double[], double[], int, int);
int main(int argc, char** argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = { 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0 };
double a[n] = { -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0 };
int m;
unsigned long long int i, N;
int NumThreads = 4;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time(NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi(argv[1]);
for (i = 0; i <= N / pow(4, i); i++)
{
double result = int_mcnd(f, a, b, n, m);
double mean = result / (pow(10, 10));
cout << setw(30) << m << setw(30) << result << setw(30) << mean << endl;
m = m * 4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time(NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
```

Note that I removed the division by four, and also the output is done outside of the parallel region. The results should be similar (except for randomness of course) than the serial version.

I observe perfect 16x speedup on a 16 core system with `-O3`

.

A few more remarks:

Declare variables as locally as possible.

If thread overhead would be a problem, you could move the parallel region outside, but then you need to think more carefully about the parallel execution, and find a solution for the shared reduction variables. Given the embarrassingly parallel nature of Monte Carlo codes, you could stick more closely with your initial solution by removing the `omp for`

directives - which then means each thread executes *all* loop iterations. Then you could manually sum up the result variable and print that. But I don't really see the point.

Source (Stackoverflow)