Actual source code: ex69f.F90

  1: program ex69F90

  3: !   Demonstrates two issues
  4: !
  5: !   A) How using mpiexec to start up a program can dramatically change
  6: !      the OpenMP thread binding/mapping resulting in poor performance
  7: !
  8: !      Set the environmental variable with, for example,
  9: !        export OMP_NUM_THREADS=4
 10: !      Run this example on one MPI process three ways
 11: !        ./ex69f
 12: !        mpiexec -n 1 ./ex69f
 13: !        mpiexec --bind-to numa -n 1 ./ex69f
 14: !
 15: !      You may get very different wall clock times
 16: !      It seems some mpiexec implementations change the thread binding/mapping that results with
 17: !      OpenMP so all the threads are run on a single core
 18: !
 19: !      The same differences occur without the PetscInitialize() call indicating
 20: !      the binding change is done by the mpiexec, not the MPI_Init()
 21: !
 22: !   B) How cpu_time() may give unexpected results, much larger than expected,
 23: !      even for code portions with no OpenMP
 24: !
 25: !      Note the CPU time for output of the second loop, it should equal the wallclock time
 26: !      since the loop is not run in parallel (with OpenMP) but instead it may be listed as
 27: !      many times higher
 28: !
 29: !     $ OMP_NUM_THREADS=8 ./ex69f (ifort compiler)
 30: !       CPU time reported by cpu_time()              1.66649300000000
 31: !       Wall clock time reported by system_clock()   0.273980000000000
 32: !       Wall clock time reported by omp_get_wtime()  0.273979902267456
 33: !
 34: #include <petsc/finclude/petscsys.h>
 35:   use petsc
 36:   implicit none

 38:   PetscErrorCode ierr
 39:   double precision cputime_start, cputime_end, wtime_start, wtime_end, omp_get_wtime
 40:   integer(kind=8) systime_start, systime_end, systime_rate
 41:   double precision x(100)
 42:   integer i, maxthreads, omp_get_max_threads

 44:   PetscCallA(PetscInitialize(ierr))
 45:   call system_clock(systime_start, systime_rate)
 46:   wtime_start = omp_get_wtime()
 47:   call cpu_time(cputime_start)
 48: !$OMP PARALLEL DO
 49:   do i = 1, 100
 50:     x(i) = exp(3.0d0*i)
 51:   end do
 52:   call cpu_time(cputime_end)
 53:   call system_clock(systime_end, systime_rate)
 54:   wtime_end = omp_get_wtime()
 55:   print *, 'CPU time reported by cpu_time()            ', cputime_end - cputime_start
 56:   print *, 'Wall clock time reported by system_clock() ', real(systime_end - systime_start, kind=8)/real(systime_rate, kind=8)
 57:   print *, 'Wall clock time reported by omp_get_wtime()', wtime_end - wtime_start
 58:   print *, 'Value of x(22)', x(22)
 59: !$ maxthreads = omp_get_max_threads()
 60:   print *, 'Number of threads set', maxthreads

 62:   call system_clock(systime_start, systime_rate)
 63:   wtime_start = omp_get_wtime()
 64:   call cpu_time(cputime_start)
 65:   do i = 1, 100
 66:     x(i) = exp(3.0d0*i)
 67:   end do
 68:   call cpu_time(cputime_end)
 69:   call system_clock(systime_end, systime_rate)
 70:   wtime_end = omp_get_wtime()
 71:   print *, 'CPU time reported by cpu_time()            ', cputime_end - cputime_start
 72:   print *, 'Wall clock time reported by system_clock() ', real(systime_end - systime_start, kind=8)/real(systime_rate, kind=8)
 73:   print *, 'Wall clock time reported by omp_get_wtime()', wtime_end - wtime_start
 74:   print *, 'Value of x(22)', x(22)
 75:   PetscCallA(PetscFinalize(ierr))
 76: end program ex69F90

 78: !/*TEST
 79: !
 80: !   build:
 81: !     requires: openmp
 82: !
 83: !   test:
 84: !     filter: grep -v "Number of threads"
 85: !
 86: !TEST*/