Actual source code: mpmpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>
  3: #include <pthread.h>
  4: #include <hwloc.h>
  5: #include <omp.h>

  7: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
  8:    otherwise use MPI_Win_allocate_shared(). They should have the same effect, except that MPI-3 is
  9:    much simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's
 10:    performance by 50%. Until the reason is found, we use mmap() instead.
 11: */
 12: #define USE_MMAP_ALLOCATE_SHARED_MEMORY

 14: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
 15:   #include <sys/mman.h>
 16:   #include <sys/types.h>
 17:   #include <sys/stat.h>
 18:   #include <fcntl.h>
 19: #endif

 21: struct _n_PetscOmpCtrl {
 22:   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
 23:   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
 24:   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
 25:   PetscBool          is_omp_master;   /* true if this rank is rank 0 of omp_comm */
 26:   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
 27:   pthread_barrier_t *barrier;         /* pointer to the barrier */
 28:   hwloc_topology_t   topology;   /* hardware topology loaded by hwloc */
 29:   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
 30:   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
 31: };

 33: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
 34:    contained by the controller.

 36:    PETSc OpenMP controller users do not call this function directly. This function exists
 37:    only because we want to separate shared memory allocation methods from other code.
 38:  */
 39: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
 40: {
 41:   MPI_Aint              size;
 42:   void                 *baseptr;
 43:   pthread_barrierattr_t attr;

 45: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
 46:   int  fd;
 47:   char pathname[PETSC_MAX_PATH_LEN];
 48: #else
 49:   PetscMPIInt disp_unit;
 50: #endif

 52:   PetscFunctionBegin;
 53: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
 54:   size = sizeof(pthread_barrier_t);
 55:   if (ctrl->is_omp_master) {
 56:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm here would bcast the partially populated pathname to the slaves */
 57:     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
 58:     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
 59:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
 60:     fd = mkstemp(pathname);
 61:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
 62:     PetscCallExternal(ftruncate, fd, size);
 63:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 64:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
 65:     PetscCallExternal(close, fd);
 66:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
 67:     /* this MPI_Barrier is to wait for the slaves to open the file before the master unlinks it */
 68:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
 69:     PetscCallExternal(unlink, pathname);
 70:   } else {
 71:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
 72:     fd = open(pathname, O_RDWR);
 73:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
 74:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 75:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
 76:     PetscCallExternal(close, fd);
 77:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
 78:   }
 79: #else
 80:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
 81:   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
 82:   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
 83: #endif
 84:   ctrl->barrier = (pthread_barrier_t *)baseptr;

 86:   /* omp master initializes the barrier */
 87:   if (ctrl->is_omp_master) {
 88:     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
 89:     PetscCallExternal(pthread_barrierattr_init, &attr);
 90:     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
 91:     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
 92:     PetscCallExternal(pthread_barrierattr_destroy, &attr);
 93:   }

 95:   /* this MPI_Barrier is to make sure the pthread barrier is initialized before the slaves use it */
 96:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
 97:   PetscFunctionReturn(PETSC_SUCCESS);
 98: }

100: /* Destroy the pthread barrier in the PETSc OpenMP controller */
101: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
102: {
103:   PetscFunctionBegin;
104:   /* this MPI_Barrier is to make sure the slaves have finished using the pthread barrier before the master destroys it */
105:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
106:   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

108: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
109:   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
110: #else
111:   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
112: #endif
113:   PetscFunctionReturn(PETSC_SUCCESS);
114: }

116: /*@C
117:   PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

119:   Input Parameters:
120: + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
121: - nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

123:   Output Parameter:
124: . pctrl - a PETSc OpenMP controller
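
  Example Usage:
  A minimal sketch of the intended lifecycle, following the pattern shown in `PetscOmpCtrlBarrier()`; `ThirdPartyLibSolve()` is a hypothetical placeholder for a third party library call that uses OpenMP:
.vb
  PetscOmpCtrl ctrl;
  PetscBool    is_master;

  PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl));
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
  if (is_master) {
    PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
    ThirdPartyLibSolve(); /* hypothetical library call that spawns OpenMP threads */
    PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
  }
  PetscCall(PetscOmpCtrlBarrier(ctrl)); /* slaves sleep here while the master runs OpenMP threads */
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve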

126:   Level: developer

128:   Developer Note:
129:   Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

131: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
132: @*/
133: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
134: {
135:   PetscOmpCtrl   ctrl;
136:   unsigned long *cpu_ulongs = NULL;
137:   PetscShmComm   pshmcomm;
138:   MPI_Comm       shm_comm;
139:   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
140:   PetscInt       num_packages, num_cores;

142:   PetscFunctionBegin;
143:   PetscCall(PetscNew(&ctrl));

145:   /*=================================================================================
146:     Init hwloc
147:    ==================================================================================*/
148:   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
149: #if HWLOC_API_VERSION >= 0x00020000
150:   /* to filter out unneeded info and have faster hwloc_topology_load */
151:   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
152:   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
153: #endif
154:   PetscCallExternal(hwloc_topology_load, ctrl->topology);

156:   /*=================================================================================
157:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
158:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
159:     others are called slaves. The OMP masters make up a new comm called omp_master_comm,
160:     which is usually passed to third party libraries.
161:    ==================================================================================*/

163:   /* fetch the stored shared memory communicator */
164:   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
165:   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

167:   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
168:   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

170:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
171:   if (nthreads == -1) {
172:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
173:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
174:     nthreads     = num_cores / num_packages;
175:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
176:   }

178:   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
179:   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

181:   /* split shm_comm into a set of omp_comms, each of size nthreads. E.g., if
182:      shm_comm_size=16 and nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
183:      color 1, so they are put in two omp_comms. Note that petsc_ranks may or may not
184:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
185:      Use 0 as the key so that the rank ordering does not change in the new comm.
186:    */
187:   color = shm_rank / nthreads;
188:   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

190:   /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
191:   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
192:   if (!omp_rank) {
193:     ctrl->is_omp_master = PETSC_TRUE; /* master */
194:     color               = 0;
195:   } else {
196:     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
197:     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
198:   }
199:   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

201:   /*=================================================================================
202:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
203:   slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
204:     and run them on the idle CPUs.
205:    ==================================================================================*/
206:   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

208:   /*=================================================================================
209:     omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
210:     is the union of the bindings of all ranks in the omp_comm
211:     =================================================================================*/

213:   ctrl->cpuset = hwloc_bitmap_alloc();
214:   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
215:   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

217:   /* hwloc's main developer said new APIs hwloc_bitmap_{nr,to,from}_ulongs would be added in hwloc 2.1 to help simplify the following bitmap pack/unpack code; see the sketch after the pack block below */
218:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
219:   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
220:   if (nr_cpu_ulongs == 1) {
221:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
222:   } else {
223:     for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
224:   }
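
  /* A possible simplification (untested sketch, not used here): if PETSc required hwloc >= 2.1, the pack code above and the
     unpack code on the master below could presumably use the hwloc_bitmap_{nr,to,from}_ulongs() APIs, roughly as

       nr_cpu_ulongs = hwloc_bitmap_nr_ulongs(hwloc_topology_get_topology_cpuset(ctrl->topology));
       hwloc_bitmap_to_ulongs(ctrl->cpuset, (unsigned)nr_cpu_ulongs, cpu_ulongs);          (pack)
       hwloc_bitmap_from_ulongs(ctrl->omp_cpuset, (unsigned)nr_cpu_ulongs, cpu_ulongs);    (unpack)

     Check the exact signatures against the hwloc documentation before adopting this. */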

226:   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

228:   if (ctrl->is_omp_master) {
229:     ctrl->omp_cpuset = hwloc_bitmap_alloc();
230:     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
231:     if (nr_cpu_ulongs == 1) {
232: #if HWLOC_API_VERSION >= 0x00020000
233:       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
234: #else
235:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
236: #endif
237:     } else {
238:       for (PetscMPIInt i = 0; i < nr_cpu_ulongs; i++) {
239: #if HWLOC_API_VERSION >= 0x00020000
240:         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
241: #else
242:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
243: #endif
244:       }
245:     }
246:   }
247:   PetscCall(PetscFree(cpu_ulongs));
248:   *pctrl = ctrl;
249:   PetscFunctionReturn(PETSC_SUCCESS);
250: }

252: /*@C
253:   PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

255:   Input Parameter:
256: . pctrl - a PETSc OpenMP controller

258:   Level: developer

260: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
261: @*/
262: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
263: {
264:   PetscOmpCtrl ctrl = *pctrl;

266:   PetscFunctionBegin;
267:   hwloc_bitmap_free(ctrl->cpuset);
268:   hwloc_topology_destroy(ctrl->topology);
269:   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
270:   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
271:   if (ctrl->is_omp_master) {
272:     hwloc_bitmap_free(ctrl->omp_cpuset);
273:     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
274:   }
275:   PetscCall(PetscFree(ctrl));
276:   PetscFunctionReturn(PETSC_SUCCESS);
277: }

279: /*@C
280:   PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

282:   Input Parameter:
283: . ctrl - a PETSc OMP controller

285:   Output Parameters:
286: + omp_comm        - a communicator that includes a master rank and slave ranks, where the master spawns threads
287: . omp_master_comm - on master ranks, a communicator that includes the master ranks of each omp_comm;
288:                     on slave ranks, `MPI_COMM_NULL` is returned instead.
289: - is_omp_master   - true if the calling process is an OMP master rank.

291:   Note:
292:   Any output parameter can be `NULL`; it is then simply ignored.
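
  For example, a rank that only needs to know whether it is an OMP master can pass `NULL` for both communicators (a minimal sketch):
.vb
  PetscBool is_master;

  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
.ve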

294:   Level: developer

296: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`
297: @*/
298: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
299: {
300:   PetscFunctionBegin;
301:   if (omp_comm) *omp_comm = ctrl->omp_comm;
302:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
303:   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
304:   PetscFunctionReturn(PETSC_SUCCESS);
305: }

307: /*@C
308:   PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

310:   Input Parameter:
311: . ctrl - a PETSc OMP controller

313:   Notes:
314:   This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
315:   require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
316:   MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, a pthread barrier does meet this requirement.

318:   Code using `PetscOmpCtrlBarrier()` typically looks like this:
319: .vb
320:   if (is_omp_master) {
321:     PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
322:     Call the library using OpenMP
323:     PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
324:   }
325:   PetscOmpCtrlBarrier(ctrl);
326: .ve

328:   Level: developer

330: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`
331: @*/
332: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
333: {
334:   int err;

336:   PetscFunctionBegin;
337:   err = pthread_barrier_wait(ctrl->barrier);
338:   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
339:   PetscFunctionReturn(PETSC_SUCCESS);
340: }

342: /*@C
343:   PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

345:   Input Parameter:
346: . ctrl - a PETSc OMP controller

348:   Note:
349:   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether the calling rank is a master rank.
350:   This function changes the CPU binding of the master rank and the nthreads-var of the OpenMP runtime.

352:   Level: developer

354: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
355: @*/
356: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
357: {
358:   PetscFunctionBegin;
359:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
360:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
361:   PetscFunctionReturn(PETSC_SUCCESS);
362: }

364: /*@C
365:   PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

367:   Input Parameter:
368: . ctrl - a PETSc OMP controller

370:   Note:
371:   Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to find out whether the calling rank is a master rank.
372:   This function restores the CPU binding of the master rank and sets the nthreads-var of the OpenMP runtime to 1.

374:   Level: developer

376: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
377: @*/
378: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
379: {
380:   PetscFunctionBegin;
381:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
382:   omp_set_num_threads(1);
383:   PetscFunctionReturn(PETSC_SUCCESS);
384: }

386: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY