Actual source code: mpishm.c
1: #include <petscsys.h>
2: #include <petsc/private/petscimpl.h>
4: struct _n_PetscShmComm {
5: PetscMPIInt *globranks; /* global ranks of each rank in the shared memory communicator */
6: PetscMPIInt shmsize; /* size of the shared memory communicator */
7: MPI_Comm globcomm, shmcomm; /* global communicator and shared memory communicator (a sub-communicator of the former) */
8: };
10: /*
11: Private routine to delete the internal shared memory communicator when a communicator is freed.
13: This is called by MPI_Comm_free() when the communicator that has this data as an attribute is freed; it is called by MPI, not directly by users.
15: Note: this is declared extern "C" because it is passed to MPI_Comm_create_keyval()
17: */
18: PETSC_EXTERN PetscMPIInt MPIAPI Petsc_ShmComm_Attr_DeleteFn(MPI_Comm comm, PetscMPIInt keyval, void *val, void *extra_state)
19: {
20: PetscShmComm p = (PetscShmComm)val;
22: PetscFunctionBegin;
23: PetscCallReturnMPI(PetscInfo(NULL, "Deleting shared memory subcommunicator in an MPI_Comm %ld\n", (long)comm));
24: PetscCallMPIReturnMPI(MPI_Comm_free(&p->shmcomm));
25: PetscCallReturnMPI(PetscFree(p->globranks));
26: PetscCallReturnMPI(PetscFree(val));
27: PetscFunctionReturn(MPI_SUCCESS);
28: }
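/* A registration sketch (illustrative only; in PETSc the keyval Petsc_ShmComm_keyval is created
   elsewhere during initialization, so this exact call site is an assumption). A delete callback
   like the one above is attached to a keyval roughly as

     PetscCallMPI(MPI_Comm_create_keyval(MPI_COMM_NULL_COPY_FN, Petsc_ShmComm_Attr_DeleteFn, &Petsc_ShmComm_keyval, NULL));

   after which MPI invokes Petsc_ShmComm_Attr_DeleteFn() automatically whenever a communicator
   carrying the attribute is freed. */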
30: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
31: /* Data structures to support freeing comms created in PetscShmCommGet().
32: Since we expect communicators passed to PetscShmCommGet() to very likely be
33: either a petsc inner communicator or an MPI communicator with a linked petsc
34: inner communicator, a simple static array suffices to store the dupped
35: communicators in the rare remaining cases.
36: */
37: #define MAX_SHMCOMM_DUPPED_COMMS 16
38: static PetscInt num_dupped_comms = 0;
39: static MPI_Comm shmcomm_dupped_comms[MAX_SHMCOMM_DUPPED_COMMS];
40: static PetscErrorCode PetscShmCommDestroyDuppedComms(void)
41: {
42: PetscFunctionBegin;
43: for (PetscInt i = 0; i < num_dupped_comms; i++) PetscCall(PetscCommDestroy(&shmcomm_dupped_comms[i]));
44: num_dupped_comms = 0; /* reset so that PETSc can be reinitialized */
45: PetscFunctionReturn(PETSC_SUCCESS);
46: }
47: #endif
49: /*@C
50: PetscShmCommGet - Returns a sub-communicator of all ranks that share common memory
52: Collective.
54: Input Parameter:
55: . globcomm - `MPI_Comm`, which can be a user `MPI_Comm` or a PETSc inner `MPI_Comm`
57: Output Parameter:
58: . pshmcomm - the PETSc shared memory communicator object
60: Level: developer
62: Note:
63: When used with MPICH, MPICH must be configured with `--download-mpich-device=ch3:nemesis`
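  Example Usage:
  A minimal sketch; the variable name is illustrative, and the returned object is stashed on the communicator and owned by PETSc, so the caller must not free it
.vb
  PetscShmComm shmcomm;
  PetscCall(PetscShmCommGet(PETSC_COMM_WORLD, &shmcomm));
.ve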
65: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
66: @*/
67: PetscErrorCode PetscShmCommGet(MPI_Comm globcomm, PetscShmComm *pshmcomm)
68: {
69: #ifdef PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY
70: MPI_Group globgroup, shmgroup;
71: PetscMPIInt *shmranks, i, flg;
72: PetscCommCounter *counter;
74: PetscFunctionBegin;
75: PetscAssertPointer(pshmcomm, 2);
76: /* Get a petsc inner comm, since we always want to stash pshmcomm on petsc inner comms */
77: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_Counter_keyval, &counter, &flg));
78: if (!flg) { /* globcomm is not a petsc comm */
79: union
80: {
81: MPI_Comm comm;
82: void *ptr;
83: } ucomm;
84: /* check if globcomm already has a linked petsc inner comm */
85: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_InnerComm_keyval, &ucomm, &flg));
86: if (!flg) {
87: /* globcomm does not have a linked petsc inner comm, so we create one and replace globcomm with it */
88: PetscCheck(num_dupped_comms < MAX_SHMCOMM_DUPPED_COMMS, globcomm, PETSC_ERR_PLIB, "PetscShmCommGet() is trying to dup more than %d MPI_Comms", MAX_SHMCOMM_DUPPED_COMMS);
89: PetscCall(PetscCommDuplicate(globcomm, &globcomm, NULL));
90: /* The first time through, register a function to free the dupped petsc comms at PetscFinalize() */
91: if (num_dupped_comms == 0) PetscCall(PetscRegisterFinalize(PetscShmCommDestroyDuppedComms));
92: shmcomm_dupped_comms[num_dupped_comms] = globcomm;
93: num_dupped_comms++;
94: } else {
95: /* otherwise, we pull out the inner comm and use it as globcomm */
96: globcomm = ucomm.comm;
97: }
98: }
100: /* Check if globcomm already has an attached pshmcomm. If not, create one */
101: PetscCallMPI(MPI_Comm_get_attr(globcomm, Petsc_ShmComm_keyval, pshmcomm, &flg));
102: if (flg) PetscFunctionReturn(PETSC_SUCCESS);
104: PetscCall(PetscNew(pshmcomm));
105: (*pshmcomm)->globcomm = globcomm;
107: PetscCallMPI(MPI_Comm_split_type(globcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(*pshmcomm)->shmcomm));
109: PetscCallMPI(MPI_Comm_size((*pshmcomm)->shmcomm, &(*pshmcomm)->shmsize));
110: PetscCallMPI(MPI_Comm_group(globcomm, &globgroup));
111: PetscCallMPI(MPI_Comm_group((*pshmcomm)->shmcomm, &shmgroup));
112: PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &shmranks));
113: PetscCall(PetscMalloc1((*pshmcomm)->shmsize, &(*pshmcomm)->globranks));
114: for (i = 0; i < (*pshmcomm)->shmsize; i++) shmranks[i] = i;
115: PetscCallMPI(MPI_Group_translate_ranks(shmgroup, (*pshmcomm)->shmsize, shmranks, globgroup, (*pshmcomm)->globranks));
116: PetscCall(PetscFree(shmranks));
117: PetscCallMPI(MPI_Group_free(&globgroup));
118: PetscCallMPI(MPI_Group_free(&shmgroup));
120: for (i = 0; i < (*pshmcomm)->shmsize; i++) PetscCall(PetscInfo(NULL, "Shared memory rank %d global rank %d\n", i, (*pshmcomm)->globranks[i]));
121: PetscCallMPI(MPI_Comm_set_attr(globcomm, Petsc_ShmComm_keyval, *pshmcomm));
122: PetscFunctionReturn(PETSC_SUCCESS);
123: #else
124: SETERRQ(globcomm, PETSC_ERR_SUP, "Shared memory communicators need MPI-3 package support.\nPlease upgrade your MPI or reconfigure with --download-mpich.");
125: #endif
126: }
128: /*@C
129: PetscShmCommGlobalToLocal - Given a global rank, returns the local rank in the shared memory communicator
131: Input Parameters:
132: + pshmcomm - the shared memory communicator object
133: - grank - the global rank
135: Output Parameter:
136: . lrank - the local rank, or `MPI_PROC_NULL` if it does not exist
138: Level: developer
140: Developer Notes:
141: Assumes pshmcomm->globranks[] is sorted
143: It may be better to rewrite this to map multiple global ranks to local ranks in the same function call
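  Example Usage:
  A small sketch, assuming `shmcomm` was obtained with `PetscShmCommGet()` and `grank` is a rank in the global communicator
.vb
  PetscMPIInt lrank;
  PetscCall(PetscShmCommGlobalToLocal(shmcomm, grank, &lrank));
  if (lrank == MPI_PROC_NULL) { /* grank does not share memory with this process */ }
.ve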
145: .seealso: `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`, `PetscShmCommGetMpiShmComm()`
146: @*/
147: PetscErrorCode PetscShmCommGlobalToLocal(PetscShmComm pshmcomm, PetscMPIInt grank, PetscMPIInt *lrank)
148: {
149: PetscMPIInt low, high, t, i;
150: PetscBool flg = PETSC_FALSE;
152: PetscFunctionBegin;
153: PetscAssertPointer(pshmcomm, 1);
154: PetscAssertPointer(lrank, 3);
155: *lrank = MPI_PROC_NULL;
156: if (grank < pshmcomm->globranks[0]) PetscFunctionReturn(PETSC_SUCCESS);
157: if (grank > pshmcomm->globranks[pshmcomm->shmsize - 1]) PetscFunctionReturn(PETSC_SUCCESS);
158: PetscCall(PetscOptionsGetBool(NULL, NULL, "-noshared", &flg, NULL));
159: if (flg) PetscFunctionReturn(PETSC_SUCCESS);
160: low = 0;
161: high = pshmcomm->shmsize;
162: while (high - low > 5) {
163: t = (low + high) / 2;
164: if (pshmcomm->globranks[t] > grank) high = t;
165: else low = t;
166: }
167: for (i = low; i < high; i++) {
168: if (pshmcomm->globranks[i] > grank) PetscFunctionReturn(PETSC_SUCCESS);
169: if (pshmcomm->globranks[i] == grank) {
170: *lrank = i;
171: PetscFunctionReturn(PETSC_SUCCESS);
172: }
173: }
174: PetscFunctionReturn(PETSC_SUCCESS);
175: }
177: /*@C
178: PetscShmCommLocalToGlobal - Given a local rank in the shared memory communicator, returns the global rank
180: Input Parameters:
181: + pshmcomm - the shared memory communicator object
182: - lrank - the local rank in the shared memory communicator
184: Output Parameter:
185: . grank - the global rank in the global communicator from which the shared memory communicator was built
187: Level: developer
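  Example Usage:
  A small sketch, assuming `shmcomm` was obtained with `PetscShmCommGet()`; it recovers the global rank of local rank 0, i.e. the first process in the shared memory communicator
.vb
  PetscMPIInt grank;
  PetscCall(PetscShmCommLocalToGlobal(shmcomm, 0, &grank));
.ve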
189: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommGetMpiShmComm()`
190: @*/
191: PetscErrorCode PetscShmCommLocalToGlobal(PetscShmComm pshmcomm, PetscMPIInt lrank, PetscMPIInt *grank)
192: {
193: PetscFunctionBegin;
194: PetscAssertPointer(pshmcomm, 1);
195: PetscAssertPointer(grank, 3);
196: PetscCheck(lrank >= 0 && lrank < pshmcomm->shmsize, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "No rank %d in the shared memory communicator", lrank);
197: *grank = pshmcomm->globranks[lrank];
198: PetscFunctionReturn(PETSC_SUCCESS);
199: }
201: /*@C
202: PetscShmCommGetMpiShmComm - Returns the MPI communicator that represents all processes with common shared memory
204: Input Parameter:
205: . pshmcomm - `PetscShmComm` object obtained with `PetscShmCommGet()`
207: Output Parameter:
208: . comm - the MPI communicator
210: Level: developer
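  Example Usage:
  A small sketch, assuming `shmcomm` was obtained with `PetscShmCommGet()`; the returned `MPI_Comm` is owned by PETSc and must not be freed by the caller
.vb
  MPI_Comm    shm;
  PetscMPIInt shmrank;
  PetscCall(PetscShmCommGetMpiShmComm(shmcomm, &shm));
  PetscCallMPI(MPI_Comm_rank(shm, &shmrank));
.ve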
212: .seealso: `PetscShmCommGlobalToLocal()`, `PetscShmCommGet()`, `PetscShmCommLocalToGlobal()`
213: @*/
214: PetscErrorCode PetscShmCommGetMpiShmComm(PetscShmComm pshmcomm, MPI_Comm *comm)
215: {
216: PetscFunctionBegin;
217: PetscAssertPointer(pshmcomm, 1);
218: PetscAssertPointer(comm, 2);
219: *comm = pshmcomm->shmcomm;
220: PetscFunctionReturn(PETSC_SUCCESS);
221: }
223: #if defined(PETSC_HAVE_OPENMP_SUPPORT)
224: #include <pthread.h>
225: #include <hwloc.h>
226: #include <omp.h>
228: /* Use mmap() to allocate shared memory (for the pthread_barrier_t object) if it is available,
229: otherwise use MPI_Win_allocate_shared. They should have the same effect except MPI-3 is much
230: simpler to use. However, on a Cori Haswell node with Cray MPI, MPI-3 worsened a test's performance
231: by 50%. Until the reason is found, we use mmap() instead.
232: */
233: #define USE_MMAP_ALLOCATE_SHARED_MEMORY
235: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
236: #include <sys/mman.h>
237: #include <sys/types.h>
238: #include <sys/stat.h>
239: #include <fcntl.h>
240: #endif
242: struct _n_PetscOmpCtrl {
243: MPI_Comm omp_comm; /* a shared memory communicator to spawn omp threads */
244: MPI_Comm omp_master_comm; /* a communicator to give to third party libraries */
245: PetscMPIInt omp_comm_size; /* size of omp_comm, a kind of OMP_NUM_THREADS */
246: PetscBool is_omp_master; /* true on rank 0 of omp_comm */
247: MPI_Win omp_win; /* a shared memory window containing a barrier */
248: pthread_barrier_t *barrier; /* pointer to the barrier */
249: hwloc_topology_t topology;
250: hwloc_cpuset_t cpuset; /* cpu bindings of omp master */
251: hwloc_cpuset_t omp_cpuset; /* union of cpu bindings of ranks in omp_comm */
252: };
254: /* Allocate and initialize a pthread_barrier_t object in memory shared by processes in omp_comm
255: contained by the controller.
257: PETSc OpenMP controller users do not call this function directly. This function exists
258: only because we want to separate shared memory allocation methods from other code.
259: */
260: static inline PetscErrorCode PetscOmpCtrlCreateBarrier(PetscOmpCtrl ctrl)
261: {
262: MPI_Aint size;
263: void *baseptr;
264: pthread_barrierattr_t attr;
266: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
267: int fd;
268: PetscChar pathname[PETSC_MAX_PATH_LEN];
269: #else
270: PetscMPIInt disp_unit;
271: #endif
273: PetscFunctionBegin;
274: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
275: size = sizeof(pthread_barrier_t);
276: if (ctrl->is_omp_master) {
277: /* use PETSC_COMM_SELF in PetscGetTmp, since it is a collective call. Using omp_comm would otherwise bcast the partially populated pathname to slaves */
278: PetscCall(PetscGetTmp(PETSC_COMM_SELF, pathname, PETSC_MAX_PATH_LEN));
279: PetscCall(PetscStrlcat(pathname, "/petsc-shm-XXXXXX", PETSC_MAX_PATH_LEN));
280: /* mkstemp replaces XXXXXX with a unique file name and opens the file for us */
281: fd = mkstemp(pathname);
282: PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not create tmp file %s with mkstemp", pathname);
283: PetscCallExternal(ftruncate, fd, size);
284: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
285: PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
286: PetscCallExternal(close, fd);
287: PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
288: /* this MPI_Barrier waits for the slaves to open the file before the master unlinks it */
289: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
290: PetscCallExternal(unlink, pathname);
291: } else {
292: PetscCallMPI(MPI_Bcast(pathname, PETSC_MAX_PATH_LEN, MPI_CHAR, 0, ctrl->omp_comm));
293: fd = open(pathname, O_RDWR);
294: PetscCheck(fd != -1, PETSC_COMM_SELF, PETSC_ERR_LIB, "Could not open tmp file %s", pathname);
295: baseptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
296: PetscCheck(baseptr != MAP_FAILED, PETSC_COMM_SELF, PETSC_ERR_LIB, "mmap() failed");
297: PetscCallExternal(close, fd);
298: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
299: }
300: #else
301: size = ctrl->is_omp_master ? sizeof(pthread_barrier_t) : 0;
302: PetscCallMPI(MPI_Win_allocate_shared(size, 1, MPI_INFO_NULL, ctrl->omp_comm, &baseptr, &ctrl->omp_win));
303: PetscCallMPI(MPI_Win_shared_query(ctrl->omp_win, 0, &size, &disp_unit, &baseptr));
304: #endif
305: ctrl->barrier = (pthread_barrier_t *)baseptr;
307: /* omp master initializes the barrier */
308: if (ctrl->is_omp_master) {
309: PetscCallMPI(MPI_Comm_size(ctrl->omp_comm, &ctrl->omp_comm_size));
310: PetscCallExternal(pthread_barrierattr_init, &attr);
311: PetscCallExternal(pthread_barrierattr_setpshared, &attr, PTHREAD_PROCESS_SHARED); /* make the barrier also work for processes */
312: PetscCallExternal(pthread_barrier_init, ctrl->barrier, &attr, (unsigned int)ctrl->omp_comm_size);
313: PetscCallExternal(pthread_barrierattr_destroy, &attr);
314: }
316: /* this MPI_Barrier is to make sure the omp barrier is initialized before slaves use it */
317: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
318: PetscFunctionReturn(PETSC_SUCCESS);
319: }
321: /* Destroy the pthread barrier in the PETSc OpenMP controller */
322: static inline PetscErrorCode PetscOmpCtrlDestroyBarrier(PetscOmpCtrl ctrl)
323: {
324: PetscFunctionBegin;
325: /* this MPI_Barrier is to make sure slaves have finished using the omp barrier before master destroys it */
326: PetscCallMPI(MPI_Barrier(ctrl->omp_comm));
327: if (ctrl->is_omp_master) PetscCallExternal(pthread_barrier_destroy, ctrl->barrier);
329: #if defined(USE_MMAP_ALLOCATE_SHARED_MEMORY) && defined(PETSC_HAVE_MMAP)
330: PetscCallExternal(munmap, ctrl->barrier, sizeof(pthread_barrier_t));
331: #else
332: PetscCallMPI(MPI_Win_free(&ctrl->omp_win));
333: #endif
334: PetscFunctionReturn(PETSC_SUCCESS);
335: }
337: /*@C
338: PetscOmpCtrlCreate - create a PETSc OpenMP controller, which manages PETSc's interaction with third party libraries that use OpenMP
340: Input Parameters:
341: + petsc_comm - a communicator some PETSc object (for example, a matrix) lives in
342: - nthreads - number of threads per MPI rank to spawn in a library using OpenMP. If nthreads = -1, let PETSc decide a suitable value
344: Output Parameter:
345: . pctrl - a PETSc OpenMP controller
347: Level: developer
349: Developer Note:
350: Possibly use the variable `PetscNumOMPThreads` to determine the number of threads to use
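  Example Usage:
  A minimal lifecycle sketch; passing `-1` lets PETSc choose the number of threads, and the omitted middle part would follow the pattern shown in `PetscOmpCtrlBarrier()`
.vb
  PetscOmpCtrl ctrl;
  PetscCall(PetscOmpCtrlCreate(PETSC_COMM_WORLD, -1, &ctrl));
  /* ... use PetscOmpCtrlGetOmpComms(), PetscOmpCtrlOmpRegionOnMasterBegin/End() and PetscOmpCtrlBarrier() ... */
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve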
352: .seealso: `PetscOmpCtrlDestroy()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
353: @*/
354: PetscErrorCode PetscOmpCtrlCreate(MPI_Comm petsc_comm, PetscInt nthreads, PetscOmpCtrl *pctrl)
355: {
356: PetscOmpCtrl ctrl;
357: unsigned long *cpu_ulongs = NULL;
358: PetscShmComm pshmcomm;
359: MPI_Comm shm_comm;
360: PetscMPIInt shm_rank, shm_comm_size, omp_rank, color, nr_cpu_ulongs;
361: PetscInt num_packages, num_cores;
363: PetscFunctionBegin;
364: PetscCall(PetscNew(&ctrl));
366: /*=================================================================================
367: Init hwloc
368: ==================================================================================*/
369: PetscCallExternal(hwloc_topology_init, &ctrl->topology);
370: #if HWLOC_API_VERSION >= 0x00020000
371: /* to filter out unneeded info and have faster hwloc_topology_load */
372: PetscCallExternal(hwloc_topology_set_all_types_filter, ctrl->topology, HWLOC_TYPE_FILTER_KEEP_NONE);
373: PetscCallExternal(hwloc_topology_set_type_filter, ctrl->topology, HWLOC_OBJ_CORE, HWLOC_TYPE_FILTER_KEEP_ALL);
374: #endif
375: PetscCallExternal(hwloc_topology_load, ctrl->topology);
377: /*=================================================================================
378: Split petsc_comm into multiple omp_comms. Ranks in an omp_comm have access to
379: physically shared memory. Rank 0 of each omp_comm is called an OMP master, and
380: others are called slaves. OMP Masters make up a new comm called omp_master_comm,
381: which is usually passed to third party libraries.
382: ==================================================================================*/
384: /* fetch the stored shared memory communicator */
385: PetscCall(PetscShmCommGet(petsc_comm, &pshmcomm));
386: PetscCall(PetscShmCommGetMpiShmComm(pshmcomm, &shm_comm));
388: PetscCallMPI(MPI_Comm_rank(shm_comm, &shm_rank));
389: PetscCallMPI(MPI_Comm_size(shm_comm, &shm_comm_size));
391: /* PETSc decides nthreads, which is the smaller of shm_comm_size and the number of cores per package (socket) */
392: if (nthreads == -1) {
393: num_packages = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_PACKAGE);
394: num_cores = hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE) <= 0 ? 1 : hwloc_get_nbobjs_by_type(ctrl->topology, HWLOC_OBJ_CORE);
395: nthreads = num_cores / num_packages;
396: if (nthreads > shm_comm_size) nthreads = shm_comm_size;
397: }
399: PetscCheck(nthreads >= 1 && nthreads <= shm_comm_size, petsc_comm, PETSC_ERR_ARG_OUTOFRANGE, "number of OpenMP threads %" PetscInt_FMT " can not be < 1 or > the MPI shared memory communicator size %d", nthreads, shm_comm_size);
400: if (shm_comm_size % nthreads) PetscCall(PetscPrintf(petsc_comm, "Warning: number of OpenMP threads %" PetscInt_FMT " is not a factor of the MPI shared memory communicator size %d, which may cause load-imbalance!\n", nthreads, shm_comm_size));
402: /* split shm_comm into a set of omp_comms, each of size nthreads. For example, if
403: shm_comm_size=16, nthreads=8, then ranks 0~7 get color 0 and ranks 8~15 get
404: color 1. They are put in two omp_comms. Note that petsc_ranks may or may not
405: be consecutive in a shm_comm, but shm_ranks always run from 0 to shm_comm_size-1.
406: Use 0 as the key so that rank ordering won't change in the new comm.
407: */
408: color = shm_rank / nthreads;
409: PetscCallMPI(MPI_Comm_split(shm_comm, color, 0 /*key*/, &ctrl->omp_comm));
411: /* put rank 0 of each omp_comm (i.e., the master ranks) into a new comm - omp_master_comm */
412: PetscCallMPI(MPI_Comm_rank(ctrl->omp_comm, &omp_rank));
413: if (!omp_rank) {
414: ctrl->is_omp_master = PETSC_TRUE; /* master */
415: color = 0;
416: } else {
417: ctrl->is_omp_master = PETSC_FALSE; /* slave */
418: color = MPI_UNDEFINED; /* to make slaves get omp_master_comm = MPI_COMM_NULL in MPI_Comm_split */
419: }
420: PetscCallMPI(MPI_Comm_split(petsc_comm, color, 0 /*key*/, &ctrl->omp_master_comm));
422: /*=================================================================================
423: Each omp_comm has a pthread_barrier_t in its shared memory, which is used to put
424: slave ranks to sleep and idle their CPUs, so that the master can fork OMP threads
425: and run them on the idle CPUs.
426: ==================================================================================*/
427: PetscCall(PetscOmpCtrlCreateBarrier(ctrl));
429: /*=================================================================================
430: omp master logs its cpu binding (i.e., cpu set) and computes a new binding that
431: is the union of the bindings of all ranks in the omp_comm
432: =================================================================================*/
434: ctrl->cpuset = hwloc_bitmap_alloc();
435: PetscCheck(ctrl->cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
436: PetscCallExternal(hwloc_get_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
438: /* hwloc main developer said they will add new APIs hwloc_bitmap_{nr,to,from}_ulongs in 2.1 to help us simplify the following bitmap pack/unpack code */
439: nr_cpu_ulongs = (hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(ctrl->topology)) + sizeof(unsigned long) * 8) / sizeof(unsigned long) / 8;
440: PetscCall(PetscMalloc1(nr_cpu_ulongs, &cpu_ulongs));
441: if (nr_cpu_ulongs == 1) {
442: cpu_ulongs[0] = hwloc_bitmap_to_ulong(ctrl->cpuset);
443: } else {
444: for (PetscInt i = 0; i < nr_cpu_ulongs; i++) cpu_ulongs[i] = hwloc_bitmap_to_ith_ulong(ctrl->cpuset, (unsigned)i);
445: }
447: PetscCallMPI(MPI_Reduce(ctrl->is_omp_master ? MPI_IN_PLACE : cpu_ulongs, cpu_ulongs, nr_cpu_ulongs, MPI_UNSIGNED_LONG, MPI_BOR, 0, ctrl->omp_comm));
449: if (ctrl->is_omp_master) {
450: ctrl->omp_cpuset = hwloc_bitmap_alloc();
451: PetscCheck(ctrl->omp_cpuset, PETSC_COMM_SELF, PETSC_ERR_LIB, "hwloc_bitmap_alloc() failed");
452: if (nr_cpu_ulongs == 1) {
453: #if HWLOC_API_VERSION >= 0x00020000
454: PetscCallExternal(hwloc_bitmap_from_ulong, ctrl->omp_cpuset, cpu_ulongs[0]);
455: #else
456: hwloc_bitmap_from_ulong(ctrl->omp_cpuset, cpu_ulongs[0]);
457: #endif
458: } else {
459: for (PetscInt i = 0; i < nr_cpu_ulongs; i++) {
460: #if HWLOC_API_VERSION >= 0x00020000
461: PetscCallExternal(hwloc_bitmap_set_ith_ulong, ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
462: #else
463: hwloc_bitmap_set_ith_ulong(ctrl->omp_cpuset, (unsigned)i, cpu_ulongs[i]);
464: #endif
465: }
466: }
467: }
468: PetscCall(PetscFree(cpu_ulongs));
469: *pctrl = ctrl;
470: PetscFunctionReturn(PETSC_SUCCESS);
471: }
473: /*@C
474: PetscOmpCtrlDestroy - destroy the PETSc OpenMP controller
476: Input Parameter:
477: . pctrl - a PETSc OpenMP controller
479: Level: developer
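  Example Usage:
  A one-line sketch, assuming `ctrl` was obtained from `PetscOmpCtrlCreate()`
.vb
  PetscCall(PetscOmpCtrlDestroy(&ctrl));
.ve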
481: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlGetOmpComms()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
482: @*/
483: PetscErrorCode PetscOmpCtrlDestroy(PetscOmpCtrl *pctrl)
484: {
485: PetscOmpCtrl ctrl = *pctrl;
487: PetscFunctionBegin;
488: hwloc_bitmap_free(ctrl->cpuset);
489: hwloc_topology_destroy(ctrl->topology);
490: PetscCall(PetscOmpCtrlDestroyBarrier(ctrl));
491: PetscCallMPI(MPI_Comm_free(&ctrl->omp_comm));
492: if (ctrl->is_omp_master) {
493: hwloc_bitmap_free(ctrl->omp_cpuset);
494: PetscCallMPI(MPI_Comm_free(&ctrl->omp_master_comm));
495: }
496: PetscCall(PetscFree(ctrl));
497: PetscFunctionReturn(PETSC_SUCCESS);
498: }
500: /*@C
501: PetscOmpCtrlGetOmpComms - Get MPI communicators from a PETSc OMP controller
503: Input Parameter:
504: . ctrl - a PETSc OMP controller
506: Output Parameters:
507: + omp_comm - a communicator that includes a master rank and slave ranks, on which the master spawns threads
508: . omp_master_comm - on master ranks, a communicator that includes the master ranks of all omp_comms;
509: on slave ranks, `MPI_COMM_NULL` is returned.
510: - is_omp_master - true if the calling process is an OMP master rank.
512: Note:
513: Any output parameter can be `NULL`; it is then simply ignored.
515: Level: developer
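  Example Usage:
  A short sketch of the usual query pattern, assuming `ctrl` was obtained from `PetscOmpCtrlCreate()`
.vb
  MPI_Comm  omp_comm, omp_master_comm;
  PetscBool is_master;
  PetscCall(PetscOmpCtrlGetOmpComms(ctrl, &omp_comm, &omp_master_comm, &is_master));
  if (is_master) { /* hand omp_master_comm to the third party OpenMP library */ }
.ve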
517: .seealso: `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`, `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`,
518: @*/
519: PetscErrorCode PetscOmpCtrlGetOmpComms(PetscOmpCtrl ctrl, MPI_Comm *omp_comm, MPI_Comm *omp_master_comm, PetscBool *is_omp_master)
520: {
521: PetscFunctionBegin;
522: if (omp_comm) *omp_comm = ctrl->omp_comm;
523: if (omp_master_comm) *omp_master_comm = ctrl->omp_master_comm;
524: if (is_omp_master) *is_omp_master = ctrl->is_omp_master;
525: PetscFunctionReturn(PETSC_SUCCESS);
526: }
528: /*@C
529: PetscOmpCtrlBarrier - Do a barrier on the MPI ranks in the omp_comm contained by the PETSc OMP controller (to let slave ranks free their CPUs)
531: Input Parameter:
532: . ctrl - a PETSc OMP controller
534: Notes:
535: This is a pthread barrier on MPI ranks. Using `MPI_Barrier()` instead would be conceptually correct, but the MPI standard does not
536: require processes blocked by `MPI_Barrier()` to free their CPUs to let other processes progress. In practice, to minimize latency,
537: MPI ranks stuck in `MPI_Barrier()` keep polling and do not free their CPUs. In contrast, pthread_barrier has this requirement.
539: Code using `PetscOmpCtrlBarrier()` would look like this,
540: .vb
541: if (is_omp_master) {
542: PetscOmpCtrlOmpRegionOnMasterBegin(ctrl);
543: Call the library using OpenMP
544: PetscOmpCtrlOmpRegionOnMasterEnd(ctrl);
545: }
546: PetscOmpCtrlBarrier(ctrl);
547: .ve
549: Level: developer
551: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`,
552: @*/
553: PetscErrorCode PetscOmpCtrlBarrier(PetscOmpCtrl ctrl)
554: {
555: int err;
557: PetscFunctionBegin;
558: err = pthread_barrier_wait(ctrl->barrier);
559: PetscCheck(!err || err == PTHREAD_BARRIER_SERIAL_THREAD, PETSC_COMM_SELF, PETSC_ERR_LIB, "pthread_barrier_wait failed within PetscOmpCtrlBarrier with return code %d", err);
560: PetscFunctionReturn(PETSC_SUCCESS);
561: }
563: /*@C
564: PetscOmpCtrlOmpRegionOnMasterBegin - Mark the beginning of an OpenMP library call on master ranks
566: Input Parameter:
567: . ctrl - a PETSc OMP controller
569: Note:
570: Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
571: This function changes the CPU binding of the master ranks and the nthreads-var of the OpenMP runtime
573: Level: developer
575: .seealso: `PetscOmpCtrlOmpRegionOnMasterEnd()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
576: @*/
577: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterBegin(PetscOmpCtrl ctrl)
578: {
579: PetscFunctionBegin;
580: PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->omp_cpuset, HWLOC_CPUBIND_PROCESS);
581: omp_set_num_threads(ctrl->omp_comm_size); /* may override the OMP_NUM_THREADS env var */
582: PetscFunctionReturn(PETSC_SUCCESS);
583: }
585: /*@C
586: PetscOmpCtrlOmpRegionOnMasterEnd - Mark the end of an OpenMP library call on master ranks
588: Input Parameter:
589: . ctrl - a PETSc OMP controller
591: Note:
592: Only master ranks can call this function. Call `PetscOmpCtrlGetOmpComms()` to know if this is a master rank.
593: This function restores the CPU binding of the master ranks and sets the nthreads-var of the OpenMP runtime to 1.
595: Level: developer
597: .seealso: `PetscOmpCtrlOmpRegionOnMasterBegin()`, `PetscOmpCtrlCreate()`, `PetscOmpCtrlDestroy()`, `PetscOmpCtrlBarrier()`
598: @*/
599: PetscErrorCode PetscOmpCtrlOmpRegionOnMasterEnd(PetscOmpCtrl ctrl)
600: {
601: PetscFunctionBegin;
602: PetscCallExternal(hwloc_set_cpubind, ctrl->topology, ctrl->cpuset, HWLOC_CPUBIND_PROCESS);
603: omp_set_num_threads(1);
604: PetscFunctionReturn(PETSC_SUCCESS);
605: }
607: #undef USE_MMAP_ALLOCATE_SHARED_MEMORY
608: #endif /* defined(PETSC_HAVE_OPENMP_SUPPORT) */