#include <petsc/finclude/petscsys.h>
program ex69F90

!     Demonstrates two issues
!
!     A) How using mpiexec to start up a program can dramatically change
!        the OpenMP thread binding/mapping, resulting in poor performance
!
!        Set the environment variable with, for example,
!            export OMP_NUM_THREADS=4
!        and run this example on one MPI process in three ways
!            ./ex69f
!            mpiexec -n 1 ./ex69f
!            mpiexec --bind-to numa -n 1 ./ex69f
!
!        You may get very different wall clock times.
!        It seems that some mpiexec implementations change the thread binding/mapping
!        used by OpenMP so that all the threads are run on a single core
!        (a sketch of how to check the binding from inside the program follows below).
!
!        The same differences occur without the PetscInitialize() call, indicating
!        the binding change is done by mpiexec, not by MPI_Init().
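!
!        One way to check the binding from inside the program (a sketch, not part of
!        the original example) is to have every thread identify itself and report how
!        many processors the runtime can use:
!
!            !$OMP PARALLEL
!            !$ print *, 'thread', omp_get_thread_num(), 'of', omp_get_num_threads(), 'procs', omp_get_num_procs()
!            !$OMP END PARALLEL
!
!        omp_get_thread_num(), omp_get_num_threads() and omp_get_num_procs() are standard
!        OpenMP routines; declare them as integer functions or use the omp_lib module.
!        Alternatively, setting OMP_DISPLAY_ENV=true (OpenMP 4.0) or OMP_DISPLAY_AFFINITY=true
!        (OpenMP 5.0) makes the runtime itself report its settings and thread placement.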
!
!     B) How cpu_time() may give unexpectedly large results, even for code
!        portions with no OpenMP
!
!        Note the CPU time reported for the second loop: it should equal the wall clock
!        time, since that loop is not run in parallel (with OpenMP), yet it may be
!        reported as many times higher.
!
!        $ OMP_NUM_THREADS=8 ./ex69f        (ifort compiler)
!          CPU time reported by cpu_time()              1.66649300000000
!          Wall clock time reported by system_clock()   0.273980000000000
!          Wall clock time reported by omp_get_wtime()  0.273979902267456
!
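!        A likely explanation (an assumption about the runtime, not something this example
!        verifies) is that cpu_time() returns CPU time summed over all threads of the
!        process, and OpenMP worker threads that spin-wait between parallel regions keep
!        accumulating CPU time even while the serial loop runs.  A quick check is the
!        ratio of the two timers, e.g.
!
!            print *, 'cpu/wall ratio', (cputime_end - cputime_start)/(wtime_end - wtime_start)
!
!        which in the run above is roughly 6 and can approach OMP_NUM_THREADS.
!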
  use petsc
  implicit none

  PetscErrorCode ierr
  double precision cputime_start, cputime_end, wtime_start, wtime_end, omp_get_wtime
  integer(kind=8) systime_start, systime_end, systime_rate
  double precision x(100)
  integer i, maxthreads, omp_get_max_threads

  PetscCallA(PetscInitialize(ierr))
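
!     Time an OpenMP-parallel loop three ways: cpu_time(), system_clock() and omp_get_wtime()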
  call system_clock(systime_start, systime_rate)
  wtime_start = omp_get_wtime()
  call cpu_time(cputime_start)
  !$OMP PARALLEL DO
  do i = 1, 100
     x(i) = exp(3.0d0*i)
  end do
  call cpu_time(cputime_end)
  call system_clock(systime_end, systime_rate)
  wtime_end = omp_get_wtime()
  print *, 'CPU time reported by cpu_time() ', cputime_end - cputime_start
  print *, 'Wall clock time reported by system_clock() ', real(systime_end - systime_start, kind=8)/real(systime_rate, kind=8)
  print *, 'Wall clock time reported by omp_get_wtime()', wtime_end - wtime_start
  print *, 'Value of x(22)', x(22)
  maxthreads = 1
  !$ maxthreads = omp_get_max_threads()
  print *, 'Number of threads set', maxthreads
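
!     Repeat the timing for the same loop run without OpenMP; cpu_time() should now equal
!     the wall clock time, but it may be reported as much larger (issue B above)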
  call system_clock(systime_start, systime_rate)
  wtime_start = omp_get_wtime()
  call cpu_time(cputime_start)
  do i = 1, 100
     x(i) = exp(3.0d0*i)
  end do
  call cpu_time(cputime_end)
  call system_clock(systime_end, systime_rate)
  wtime_end = omp_get_wtime()
  print *, 'CPU time reported by cpu_time() ', cputime_end - cputime_start
  print *, 'Wall clock time reported by system_clock() ', real(systime_end - systime_start, kind=8)/real(systime_rate, kind=8)
  print *, 'Wall clock time reported by omp_get_wtime()', wtime_end - wtime_start
  print *, 'Value of x(22)', x(22)
  PetscCallA(PetscFinalize(ierr))
end program ex69F90

!/*TEST
!
!   build:
!     requires: openmp
!
!   test:
!     filter: grep -v "Number of threads"
!
!TEST*/