Actual source code: mpishm.c

  1: #include <petscsys.h>
  2: #include <petsc/private/petscimpl.h>

  4: struct _n_PetscShmComm {
  5:   PetscMPIInt *globranks;         /* global ranks of each rank in the shared memory communicator */
  6:   PetscMPIInt  shmsize;           /* size of the shared memory communicator */
  7:   MPI_Comm     globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
  8: };

 10: /*
 11:    Private routine to delete internal shared memory communicator when a communicator is freed.

 13:   This is called by MPI, not by users; MPI_Comm_free() invokes it when the communicator that has this data as an attribute is freed.

 15:    Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()

 17: */
 18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
 19: {
 20:   PetscShmComm p = (PetscShmComm)val;

 22:   PetscFunctionBegin;
 23:   PetscCallReturnMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
 24:   PetscCallMPIReturnMPI(MPI_Comm_free(&p->shmcomm));
 25:   PetscCallReturnMPI(PetscFree(p->globranks));
 26:   PetscCallReturnMPI(PetscFree(val));
 27:   PetscFunctionReturn(MPI_SUCCESS);
 28: }
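
/* A hedged illustration (not part of mpishm.c): the keyval paired with the delete callback above,
   Petsc_ShmComm_keyval, is created during PETSc initialization, elsewhere in the library. The general
   MPI pattern for registering such a callback looks roughly like the following, where keyval is an
   illustrative local name:

     PetscMPIInt keyval = MPI_KEYVAL_INVALID;
     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_DeleteFn, &keyval, NULL));
     PetscCallMPI(MPI_Comm_set_attr(comm, keyval, data));   attach data; a later MPI_Comm_free(comm) triggers the callback
*/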

 30: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 31:   /* Data structures to support freeing comms created in PetscShmCommGet().
 32:   Since we expect communicators passed to PetscShmCommGet() to very likely be
 33:   either a petsc inner communicator or an MPI communicator with a linked petsc
 34:   inner communicator, we use a simple static array to store the dupped communicators
 35:   needed in the rare remaining cases.
 36:  */
 37:   #define MAX_SHMCOMM_DUPPED_COMMS 16
 38: static PetscInt       num_dupped_comms = 0;
 39: static MPI_Comm       shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
 40: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
 41: {
 42:   PetscFunctionBegin;
 43:   for (PetscInt i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
 44:   num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
 45:   PetscFunctionReturn(PETSC_SUCCESS);
 46: }
 47: #endif

 49: /*@C
 50:   PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory

 52:   Collective.

 54:   Input Parameter:
 55: . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`

 57:   Output Parameter:
 58: . pshmcomm - the PETSc shared memory communicator object

 60:   Level: developer

 62:   Note:
 63:   When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`

 65: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
 66: @*/
 67: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
 68: {
 69: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
 70:   MPI_Group         globgroup, shmgroup;
 71:   PetscMPIInt      *shmranks, i, flg;
 72:   PetscCommCounter *counter;

 74:   PetscFunctionBegin;
 75:   PetscAssertPointer(pshmcomm, 2);
 76:   /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
 77:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
 78:   if (!flg) { /* globcomm is not a petsc comm */
 79:     union
 80:     {
 81:       MPI_Comm comm;
 82:       void    *ptr;
 83:     } ucomm;
 84:     /* check if globcomm already has a linked petsc inner comm */
 85:     PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
 86:     if (!flg) {
 87:       /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
 88:       PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
 89:       PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
 90:       /* The first time through, register a function to free the dupped petsc comms at PetscFinalize() */
 91:       if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
 92:       shmcomm_dupped_comms[num_dupped_comms] = globcomm;
 93:       num_dupped_comms++;
 94:     } else {
 95:       /* otherwise, we pull out the inner comm and use it as globcomm */
 96:       globcomm = ucomm.comm;
 97:     }
 98:   }

100:   /* Check if globcomm already has an attached pshmcomm. If not, create one */
101:   PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
102:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);

104:   PetscCall(PetscNew(pshmcomm));
105:   (*pshmcomm)->globcomm = globcomm;

107:   PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));

109:   PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
110:   PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
111:   PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
112:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
113:   PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
114:   for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
115:   PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
116:   PetscCall(PetscFree(shmranks));
117:   PetscCallMPI(MPI_Group_free(&globgroup));
118:   PetscCallMPI(MPI_Group_free(&shmgroup));

120:   for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
121:   PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
122:   PetscFunctionReturn(PETSC_SUCCESS);
123: #else
124:   SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
125: #endif
126: }
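
/* A minimal usage sketch for PetscShmCommGet() (illustrative, not from the PETSc sources); it assumes
   the call sits inside a function using PetscFunctionBegin/PetscFunctionReturn so PetscCall() applies:

     PetscShmComm shm;
     PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shm));

   The returned object is cached as an attribute on the (inner) communicator, so repeated calls are
   cheap, and the caller does not free it; the attribute delete callback above does that.
*/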

128: /*@C
129:   PetscShmCommGlobalToLocal - Given a global rank returns the local rank in the shared memory communicator

131:   Input Parameters:
132: + pshmcomm - the shared memory communicator object
133: - grank    - the global rank

135:   Output Parameter:
136: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist

138:   Level: developer

140:   Developer Notes:
141:   Assumes `pshmcomm->globranks[]` is sorted

143:   It may be better to rewrite this to map multiple global ranks to local in the same function call

145: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
146: @*/
147: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
148: {
149:   PetscMPIInt low, high, t, i;
150:   PetscBool   flg = PETSC_FALSE;

152:   PetscFunctionBegin;
153:   PetscAssertPointer(pshmcomm, 1);
154:   PetscAssertPointer(lrank, 3);
155:   *lrank = MPI_PROC_NULL;
156:   if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
157:   if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
158:   PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
159:   if (flg) PetscFunctionReturn(PETSC_SUCCESS);
160:   low  = 0;
161:   high = pshmcomm->shmsize;
162:   while (high - low > 5) {
163:     t = (low + high) / 2;
164:     if (pshmcomm->globranks[t] > grank) high = t;
165:     else low = t;
166:   }
167:   for (i = low; i < high; i++) {
168:     if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
169:     if (pshmcomm->globranks[i] == grank) {
170:       *lrank = i;
171:       PetscFunctionReturn(PETSC_SUCCESS);
172:     }
173:   }
174:   PetscFunctionReturn(PETSC_SUCCESS);
175: }
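
/* A minimal usage sketch for PetscShmCommGlobalToLocal() (illustrative): test whether a global rank
   grank lives on the same shared-memory node as this process; shm is assumed to come from
   PetscShmCommGet():

     PetscMPIInt lrank;
     PetscCall(PetscShmCommGlobalToLocal(shm, grank, &lrank));
     if (lrank == MPI_PROC_NULL) { grank is not in this process's shared-memory communicator }
*/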

177: /*@C
178:   PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator returns the global rank

180:   Input Parameters:
181: + pshmcomm - the shared memory communicator object
182: - lrank    - the local rank in the shared memory communicator

184:   Output Parameter:
185: . grank - the global rank in the global communicator where the shared memory communicator is built

187:   Level: developer

189: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
190: @*/
191: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
192: {
193:   PetscFunctionBegin;
194:   PetscAssertPointer(pshmcomm, 1);
195:   PetscAssertPointer(grank, 3);
196:   PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
197:   *grank = pshmcomm->globranks[lrank];
198:   PetscFunctionReturn(PETSC_SUCCESS);
199: }
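
/* A minimal usage sketch for PetscShmCommLocalToGlobal() (illustrative): find the global rank of the
   node leader, i.e. local rank 0 of the shared-memory communicator:

     PetscMPIInt leader;
     PetscCall(PetscShmCommLocalToGlobal(shm, 0, &leader));
*/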

201: /*@C
202:   PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory

204:   Input Parameter:
205: . pshmcomm - PetscShmComm object obtained with PetscShmCommGet()

207:   Output Parameter:
208: . comm - the MPI communicator

210:   Level: developer

212: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
213: @*/
214: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
215: {
216:   PetscFunctionBegin;
217:   PetscAssertPointer(pshmcomm, 1);
218:   PetscAssertPointer(comm, 2);
219:   *comm = pshmcomm->shmcomm;
220:   PetscFunctionReturn(PETSC_SUCCESS);
221: }
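
/* A minimal usage sketch for PetscShmCommGetMpiShmComm() (illustrative, assuming an MPI-3
   implementation): get the raw shared-memory communicator and use it directly with MPI-3 shared-memory
   windows. The communicator is owned by the PetscShmComm object, so the caller must not MPI_Comm_free() it:

     MPI_Comm node_comm;
     MPI_Win  win;
     double  *base;
     PetscCall(PetscShmCommGetMpiShmComm(shm, &node_comm));
     PetscCallMPI(MPI_Win_allocate_shared(100 * sizeof(double), sizeof(double), MPI_INFO_NULL, node_comm, &base, &win));
*/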

223: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
224:   #include <pthread.h>
225:   #include <hwloc.h>
226:   #include <omp.h>

228:   /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
229:    otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
230:    simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
231:    by 50%. Until the reason is found, we use mmap() instead.
232: */
233:   #define USE_MMAP_ALLOCATE_SHARED_MEMORY

235:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
236:     #include <sys/mman.h>
237:     #include <sys/types.h>
238:     #include <sys/stat.h>
239:     #include <fcntl.h>
240:   #endif

242: struct _n_PetscOmpCtrl {
243:   MPI_Comm           omp_comm;        /* a shared memory communicator to spawn omp threads */
244:   MPI_Comm           omp_master_comm; /* a communicator to give to third party libraries */
245:   PetscMPIInt        omp_comm_size;   /* size of omp_comm, a kind of OMP_NUM_THREADS */
246:   PetscBool          is_omp_master;   /* true on rank 0 of omp_comm */
247:   MPI_Win            omp_win;         /* a shared memory window containing a barrier */
248:   pthread_barrier_t *barrier;         /* pointer to the barrier */
249:   hwloc_topology_t   topology;
250:   hwloc_cpuset_t     cpuset;     /* cpu bindings of omp master */
251:   hwloc_cpuset_t     omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
252: };

254: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
255:    contained by the controller.

257:    PETSc OpenMP controller users do not call this function directly. This function exists
258:    only because we want to separate shared memory allocation methods from other code.
259:  */
260: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
261: {
262:   MPI_Aint              size;
263:   void                 *baseptr;
264:   pthread_barrierattr_t attr;

266:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
267:   int       fd;
268:   PetscChar pathname[PETSC_MAX_PATH_LEN];
269:   #else
270:   PetscMPIInt disp_unit;
271:   #endif

273:   PetscFunctionBegin;
274:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
275:   size = sizeof(pthread_barrier_t);
276:   if (ctrl->is_omp_master) {
277:     /* use PETSC_COMM_SELF in PetscGetTmp() since it is a collective call; using omp_comm here would bcast the partially populated pathname to the slaves */
278:     PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
279:     PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
280:     /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
281:     fd = mkstemp(pathname);
282:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
283:     PetscCallExternal(ftruncate, fd, size);
284:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
285:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
286:     PetscCallExternal(close, fd);
287:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
288:     /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
289:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
290:     PetscCallExternal(unlink, pathname);
291:   } else {
292:     PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
293:     fd = open(pathname, O_RDWR);
294:     PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
295:     baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
296:     PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
297:     PetscCallExternal(close, fd);
298:     PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
299:   }
300:   #else
301:   size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
302:   PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
303:   PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
304:   #endif
305:   ctrl->barrier = (pthread_barrier_t *)baseptr;

307:   /* omp master initializes the barrier */
308:   if (ctrl->is_omp_master) {
309:     PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
310:     PetscCallExternal(pthread_barrierattr_init, &attr);
311:     PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
312:     PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
313:     PetscCallExternal(pthread_barrierattr_destroy, &attr);
314:   }

316:   /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
317:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
318:   PetscFunctionReturn(PETSC_SUCCESS);
319: }

321: /* Destroy the pthread barrier in the PETSc OpenMP controller */
322: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
323: {
324:   PetscFunctionBegin;
325:   /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
326:   PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
327:   if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);

329:   #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
330:   PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
331:   #else
332:   PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
333:   #endif
334:   PetscFunctionReturn(PETSC_SUCCESS);
335: }

337: /*@C
338:     PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP

340:     Input Parameters:
341: +   petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
342: -   nthreads   - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value

344:     Output Parameter:
345: .   pctrl      - a PETSc OpenMP controller

347:     Level: developer

349:     Developer Note:
350:     Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use

352: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
353: @*/
354: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
355: {
356:   PetscOmpCtrl   ctrl;
357:   unsigned long *cpu_ulongs = NULL;
358:   PetscShmComm   pshmcomm;
359:   MPI_Comm       shm_comm;
360:   PetscMPIInt    shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
361:   PetscInt       num_packages, num_cores;

363:   PetscFunctionBegin;
364:   PetscCall(PetscNew(&ctrl));

366:   /*=================================================================================
367:     Init hwloc
368:    ==================================================================================*/
369:   PetscCallExternal(hwloc_topology_init, &ctrl->topology);
370:   #if HWLOC_API_VERSION >= 0x00020000
371:   /* to filter out unneeded info and have faster hwloc_topology_load */
372:   PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
373:   PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
374:   #endif
375:   PetscCallExternal(hwloc_topology_load, ctrl->topology);

377:   /*=================================================================================
378:     Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
379:     physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
380:     others are called slaves. OMP Masters make up a new comm called omp_master_comm,
381:     which is usually passed to third party libraries.
382:    ==================================================================================*/

384:   /* fetch the stored shared memory communicator */
385:   PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
386:   PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));

388:   PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
389:   PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));

391:   /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
392:   if (nthreads == -1) {
393:     num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
394:     num_cores    = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
395:     nthreads     = num_cores / num_packages;
396:     if (nthreads > shm_comm_size) nthreads = shm_comm_size;
397:   }

399:   PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
400:   if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));

402:   /* split shm_comm into a set of omp_comms with each of size nthreads. Ex., if
403:      shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
404:      color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
405:      be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
406:      Use 0 as the key so that rank ordering won't change in the new comm.
407:    */
408:   color = shm_rank / nthreads;
409:   PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));

411:   /* put the rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
412:   PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
413:   if (!omp_rank) {
414:     ctrl->is_omp_master = PETSC_TRUE; /* master */
415:     color               = 0;
416:   } else {
417:     ctrl->is_omp_master = PETSC_FALSE;   /* slave */
418:     color               = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
419:   }
420:   PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));

422:   /*=================================================================================
423:     Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
424:     slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
425:     and run them on the idle CPUs.
426:    ==================================================================================*/
427:   PetscCall(PetscOmpCtrlCreateBarrier(ctrl));

429:   /*=================================================================================
430:     the omp master records its cpu binding (i.e., cpu set) and computes a new binding that
431:     is the union of the bindings of all ranks in the omp_comm
432:     =================================================================================*/

434:   ctrl->cpuset = hwloc_bitmap_alloc();
435:   PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
436:   PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);

438:   /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
439:   nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
440:   PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
441:   if (nr_cpu_ulongs == 1) {
442:     cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
443:   } else {
444:     for (PetscInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
445:   }

447:   PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));

449:   if (ctrl->is_omp_master) {
450:     ctrl->omp_cpuset = hwloc_bitmap_alloc();
451:     PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
452:     if (nr_cpu_ulongs == 1) {
453:   #if HWLOC_API_VERSION >= 0x00020000
454:       PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
455:   #else
456:       hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
457:   #endif
458:     } else {
459:       for (PetscInt i = 0; i < nr_cpu_ulongs; i++) {
460:   #if HWLOC_API_VERSION >= 0x00020000
461:         PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
462:   #else
463:         hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
464:   #endif
465:       }
466:     }
467:   }
468:   PetscCall(PetscFree(cpu_ulongs));
469:   *pctrl = ctrl;
470:   PetscFunctionReturn(PETSC_SUCCESS);
471: }
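
/* A minimal lifecycle sketch for the OpenMP controller (illustrative): create it on a PETSc
   communicator, let PETSc choose the number of threads, and destroy it when done:

     PetscOmpCtrl ctrl;
     PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl));   nthreads = -1: PETSc picks a value
     use the controller, see PetscOmpCtrlGetOmpComms() and PetscOmpCtrlBarrier() below
     PetscCall(PetscOmpCtrlDestroy(&ctrl));
*/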

473: /*@C
474:     PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller

476:     Input Parameter:
477: .   pctrl  - a PETSc OpenMP controller

479:     Level: developer

481: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
482: @*/
483: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
484: {
485:   PetscOmpCtrl ctrl = *pctrl;

487:   PetscFunctionBegin;
488:   hwloc_bitmap_free(ctrl->cpuset);
489:   hwloc_topology_destroy(ctrl->topology);
490:   PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
491:   PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
492:   if (ctrl->is_omp_master) {
493:     hwloc_bitmap_free(ctrl->omp_cpuset);
494:     PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
495:   }
496:   PetscCall(PetscFree(ctrl));
497:   PetscFunctionReturn(PETSC_SUCCESS);
498: }

500: /*@C
501:     PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller

503:     Input Parameter:
504: .   ctrl - a PETSc OMP controller

506:     Output Parameters:
507: +   omp_comm         - a communicator that includes a master rank and the slave ranks on which the master spawns threads
508: .   omp_master_comm  - on master ranks, returns a communicator that includes the master ranks of each omp_comm;
509:                        on slave ranks, `MPI_COMM_NULL` is returned.
510: -   is_omp_master    - true if the calling process is an OMP master rank.

512:     Note:
513:     Any output parameter can be `NULL`; it is then simply ignored.

515:     Level: developer

517: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
518: @*/
519: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
520: {
521:   PetscFunctionBegin;
522:   if (omp_comm) *omp_comm = ctrl->omp_comm;
523:   if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
524:   if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
525:   PetscFunctionReturn(PETSC_SUCCESS);
526: }
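
/* A minimal usage sketch for PetscOmpCtrlGetOmpComms() (illustrative): since any output argument may
   be NULL, a caller that only needs the master communicator and the master flag can do:

     MPI_Comm  master_comm;
     PetscBool is_master;
     PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, &master_comm, &is_master));
     if (is_master) { hand master_comm to the OpenMP-enabled third party library }
*/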

528: /*@C
529:     PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)

531:     Input Parameter:
532: .   ctrl - a PETSc OMP controller

534:     Notes:
535:     This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
536:     require that processes blocked by `MPI_Barrier()` free their CPUs to let other processes progress. In practice, to minimize latency,
537:     MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs; a pthread barrier, in contrast, does free the CPU.

539:     Code using `PetscOmpCtrlBarrier()` would look like this:
540: .vb
541:     if (is_omp_master) {
542:       PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
543:       Call the library using OpenMP
544:       PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
545:     }
546:     PetscOmpCtrlBarrier(ctrl);
547: .ve

549:     Level: developer

551: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
552: @*/
553: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
554: {
555:   int err;

557:   PetscFunctionBegin;
558:   err = pthread_barrier_wait(ctrl->barrier);
559:   PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
560:   PetscFunctionReturn(PETSC_SUCCESS);
561: }

563: /*@C
564:     PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks

566:     Input Parameter:
567: .   ctrl - a PETSc OMP controller

569:     Note:
570:     Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
571:     This function changes the CPU binding of the master rank and the nthreads-var of the OpenMP runtime.

573:     Level: developer

575: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
576: @*/
577: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
578: {
579:   PetscFunctionBegin;
580:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
581:   omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
582:   PetscFunctionReturn(PETSC_SUCCESS);
583: }

585: /*@C
586:    PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks

588:    Input Parameter:
589: .  ctrl - a PETSc OMP controller

591:    Note:
592:    Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to determine whether the calling rank is a master rank.
593:    This function restores the CPU binding of the master rank and sets the nthreads-var of the OpenMP runtime to 1.

595:    Level: developer

597: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
598: @*/
599: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
600: {
601:   PetscFunctionBegin;
602:   PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
603:   omp_set_num_threads(1);
604:   PetscFunctionReturn(PETSC_SUCCESS);
605: }
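
/* An end-to-end sketch (illustrative) combining PetscOmpCtrlOmpRegionOnMasterBegin()/End() with
   PetscOmpCtrlBarrier(), following the pattern documented above: slaves reach the barrier right away
   and sleep, freeing their CPUs, while the master widens its CPU binding, runs the OpenMP library,
   restores its binding, and then joins the barrier:

     PetscBool is_master;
     PetscCall(PetscOmpCtrlGetOmpComms(ctrl, NULL, NULL, &is_master));
     if (is_master) {
       PetscCall(PetscOmpCtrlOmpRegionOnMasterBegin(ctrl));
       call the third party library that uses OpenMP
       PetscCall(PetscOmpCtrlOmpRegionOnMasterEnd(ctrl));
     }
     PetscCall(PetscOmpCtrlBarrier(ctrl));
*/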

607:   #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
608: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */