Actual source code: sfwindow.c

  1: #include <petsc/private/sfimpl.h>

  3: typedef struct _n_PetscSFDataLink *PetscSFDataLink;
  4: typedef struct _n_PetscSFWinLink  *PetscSFWinLink;

  6: typedef struct {
  7:   PetscSFWindowSyncType   sync;   /* FENCE, LOCK, or ACTIVE synchronization */
  8:   PetscSFDataLink         link;   /* List of MPI data types, lazily constructed for each data type */
  9:   PetscSFWinLink          wins;   /* List of active windows */
 10:   PetscSFWindowFlavorType flavor; /* Current PETSCSF_WINDOW_FLAVOR_ */
 11:   PetscSF                 dynsf;  /* SF used to broadcast the dynamic window addresses of the roots to the leaf ranks */
 12:   MPI_Info                info;   /* info used for subsequent window creation */
 13: } PetscSF_Window;

 15: struct _n_PetscSFDataLink {
 16:   MPI_Datatype    unit;   /* unit type this cache entry was built for */
 17:   MPI_Datatype   *mine;   /* per-rank datatypes into the local leaf buffer */
 18:   MPI_Datatype   *remote; /* per-rank datatypes into the remote root buffer */
 19:   PetscSFDataLink next;
 20: };

 22: struct _n_PetscSFWinLink {
 23:   PetscBool               inuse;           /* window is currently checked out via PetscSFGetWindow() */
 24:   size_t                  bytes;           /* size in bytes of one unit */
 25:   void                   *addr;            /* local window memory (the root array, or an MPI-allocated buffer) */
 26:   void                   *paddr;           /* user root array currently bound to the window */
 27:   MPI_Win                 win;
 28:   MPI_Request            *reqs;            /* per-rank requests, used with PETSCSF_WINDOW_SYNC_LOCK */
 29:   PetscSFWindowFlavorType flavor;
 30:   MPI_Aint               *dyn_target_addr; /* per-root-rank target displacements for PETSCSF_WINDOW_FLAVOR_DYNAMIC */
 31:   PetscBool               epoch;           /* an access epoch was opened when the window was checked out */
 32:   PetscSFWinLink          next;
 33: };

 35: const char *const PetscSFWindowSyncTypes[]   = {"FENCE", "LOCK", "ACTIVE", "PetscSFWindowSyncType", "PETSCSF_WINDOW_SYNC_", NULL};
 36: const char *const PetscSFWindowFlavorTypes[] = {"CREATE", "DYNAMIC", "ALLOCATE", "SHARED", "PetscSFWindowFlavorType", "PETSCSF_WINDOW_FLAVOR_", NULL};

 38: /* Built-in MPI_Ops act elementwise inside MPI_Accumulate, but cannot be used with composite types inside collectives (MPI_Allreduce) */
 39: static PetscErrorCode PetscSFWindowOpTranslate(MPI_Op *op)
 40: {
 41:   if (*op == MPIU_SUM) *op = MPI_SUM;
 42:   else if (*op == MPIU_MAX) *op = MPI_MAX;
 43:   else if (*op == MPIU_MIN) *op = MPI_MIN;
 44:   return 0;
 45: }
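
/* Illustrative sketch (not part of the original file): after translation, the op
   can be passed to MPI_Accumulate() together with a predefined datatype. Assumes
   `leaf` points at a PetscScalar, `r` is a valid target rank, and `win` exposes
   one PetscScalar per root.

     MPI_Op op = MPIU_SUM;
     PetscSFWindowOpTranslate(&op);  // op is now MPI_SUM
     MPI_Accumulate(leaf, 1, MPIU_SCALAR, r, 0, 1, MPIU_SCALAR, op, win);
*/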

 47: /*@C
 48:    PetscSFWindowGetDataTypes - gets composite local and remote data types for each rank

 50:    Not Collective

 52:    Input Parameters:
 53: +  sf - star forest
 54: -  unit - data type for each node

 56:    Output Parameters:
 57: +  localtypes - types describing part of local leaf buffer referencing each remote rank
 58: -  remotetypes - types describing part of remote root buffer referenced for each remote rank

 60:    Level: developer

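   Example Usage: a minimal sketch (not part of the original manual page); it mirrors
   the loop in PetscSFBcastBegin_Window() below, assuming a window `win` exposing the
   root buffer and a leaf buffer `leafdata`:
.vb
   const MPI_Datatype *mine, *remote;
   const PetscMPIInt  *ranks;
   PetscInt            i, nranks;

   PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL);
   PetscSFWindowGetDataTypes(sf, MPIU_SCALAR, &mine, &remote);
   for (i = 0; i < nranks; i++) MPI_Get(leafdata, 1, mine[i], ranks[i], 0, 1, remote[i], win);
.ve
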
 62: .seealso: `PetscSFSetGraph()`, `PetscSFView()`
 63: @*/
 64: static PetscErrorCode PetscSFWindowGetDataTypes(PetscSF sf, MPI_Datatype unit, const MPI_Datatype **localtypes, const MPI_Datatype **remotetypes)
 65: {
 66:   PetscSF_Window    *w = (PetscSF_Window *)sf->data;
 67:   PetscSFDataLink    link;
 68:   PetscInt           i, nranks;
 69:   const PetscInt    *roffset, *rmine, *rremote;
 70:   const PetscMPIInt *ranks;

 72:   /* Look for types in cache */
 73:   for (link = w->link; link; link = link->next) {
 74:     PetscBool match;
 75:     MPIPetsc_Type_compare(unit, link->unit, &match);
 76:     if (match) {
 77:       *localtypes  = link->mine;
 78:       *remotetypes = link->remote;
 79:       return 0;
 80:     }
 81:   }

 83:   /* Create new composite types for each send rank */
 84:   PetscSFGetRootRanks(sf, &nranks, &ranks, &roffset, &rmine, &rremote);
 85:   PetscNew(&link);
 86:   MPI_Type_dup(unit, &link->unit);
 87:   PetscMalloc2(nranks, &link->mine, nranks, &link->remote);
 88:   for (i = 0; i < nranks; i++) {
 89:     PetscInt     rcount = roffset[i + 1] - roffset[i];
 90:     PetscMPIInt *rmine, *rremote;
 91: #if !defined(PETSC_USE_64BIT_INDICES)
 92:     rmine   = sf->rmine + sf->roffset[i];
 93:     rremote = sf->rremote + sf->roffset[i];
 94: #else
 95:     PetscInt j;
 96:     PetscMalloc2(rcount, &rmine, rcount, &rremote);
 97:     for (j = 0; j < rcount; j++) {
 98:       PetscMPIIntCast(sf->rmine[sf->roffset[i] + j], rmine + j);
 99:       PetscMPIIntCast(sf->rremote[sf->roffset[i] + j], rremote + j);
100:     }
101: #endif

103:     MPI_Type_create_indexed_block(rcount, 1, rmine, link->unit, &link->mine[i]);
104:     MPI_Type_create_indexed_block(rcount, 1, rremote, link->unit, &link->remote[i]);
105: #if defined(PETSC_USE_64BIT_INDICES)
106:     PetscFree2(rmine, rremote);
107: #endif
108:     MPI_Type_commit(&link->mine[i]);
109:     MPI_Type_commit(&link->remote[i]);
110:   }
111:   link->next = w->link;
112:   w->link    = link;

114:   *localtypes  = link->mine;
115:   *remotetypes = link->remote;
116:   return 0;
117: }

119: /*@C
120:    PetscSFWindowSetFlavorType - Set flavor type for MPI_Win creation

122:    Logically Collective

124:    Input Parameters:
125: +  sf - star forest for communication
126: -  flavor - flavor type

128:    Options Database Key:
129: .  -sf_window_flavor <flavor> - sets the flavor type CREATE, DYNAMIC, ALLOCATE or SHARED (see PetscSFWindowFlavorType)

131:    Level: advanced

133:    Notes: Window reuse follows these rules:

135:      PETSCSF_WINDOW_FLAVOR_CREATE: creates a new window every time, uses MPI_Win_create

137:      PETSCSF_WINDOW_FLAVOR_DYNAMIC: uses MPI_Win_create_dynamic/MPI_Win_attach and tries to reuse windows by comparing the root array. Intended to be used on repeated applications of the same SF, e.g.
138:        for i=1 to K
139:          PetscSFOperationBegin(rootdata1,leafdata_whatever);
140:          PetscSFOperationEnd(rootdata1,leafdata_whatever);
141:          ...
142:          PetscSFOperationBegin(rootdataN,leafdata_whatever);
143:          PetscSFOperationEnd(rootdataN,leafdata_whatever);
144:        endfor
145:        The following pattern will instead raise an error
146:          PetscSFOperationBegin(rootdata1,leafdata_whatever);
147:          PetscSFOperationEnd(rootdata1,leafdata_whatever);
148:          PetscSFOperationBegin(rank ? rootdata1 : rootdata2,leafdata_whatever);
149:          PetscSFOperationEnd(rank ? rootdata1 : rootdata2,leafdata_whatever);

151:      PETSCSF_WINDOW_FLAVOR_ALLOCATE: uses MPI_Win_allocate, reuses any pre-existing window that fits the data and is not in use

153:      PETSCSF_WINDOW_FLAVOR_SHARED: uses MPI_Win_allocate_shared, with the same reuse policy as PETSCSF_WINDOW_FLAVOR_ALLOCATE

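   Example Usage: a minimal sketch (not part of the original manual page) of the
   DYNAMIC reuse pattern described above; assumes the graph of `sf` is set and
   `rootdata`/`leafdata` are valid:
.vb
   PetscSFSetType(sf, PETSCSFWINDOW);
   PetscSFWindowSetFlavorType(sf, PETSCSF_WINDOW_FLAVOR_DYNAMIC);
   for (i = 0; i < K; i++) { // the window attached to rootdata is created once, then reused
     PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
     PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
   }
.ve
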
155: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowGetFlavorType()`
156: @*/
157: PetscErrorCode PetscSFWindowSetFlavorType(PetscSF sf, PetscSFWindowFlavorType flavor)
158: {
161:   PetscTryMethod(sf, "PetscSFWindowSetFlavorType_C", (PetscSF, PetscSFWindowFlavorType), (sf, flavor));
162:   return 0;
163: }

165: static PetscErrorCode PetscSFWindowSetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType flavor)
166: {
167:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

169:   w->flavor = flavor;
170:   return 0;
171: }

173: /*@C
174:    PetscSFWindowGetFlavorType - Get flavor type for PetscSF communication

176:    Not Collective

178:    Input Parameter:
179: .  sf - star forest for communication

181:    Output Parameter:
182: .  flavor - flavor type

184:    Level: advanced

186: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowSetFlavorType()`
187: @*/
188: PetscErrorCode PetscSFWindowGetFlavorType(PetscSF sf, PetscSFWindowFlavorType *flavor)
189: {
192:   PetscUseMethod(sf, "PetscSFWindowGetFlavorType_C", (PetscSF, PetscSFWindowFlavorType *), (sf, flavor));
193:   return 0;
194: }

196: static PetscErrorCode PetscSFWindowGetFlavorType_Window(PetscSF sf, PetscSFWindowFlavorType *flavor)
197: {
198:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

200:   *flavor = w->flavor;
201:   return 0;
202: }

204: /*@C
205:    PetscSFWindowSetSyncType - Set synchronization type for PetscSF communication

207:    Logically Collective

209:    Input Parameters:
210: +  sf - star forest for communication
211: -  sync - synchronization type

213:    Options Database Key:
214: .  -sf_window_sync <sync> - sets the synchronization type FENCE, LOCK, or ACTIVE (see PetscSFWindowSyncType)

216:    Level: advanced

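   Example Usage: a minimal sketch (not part of the original manual page):
.vb
   PetscSFSetType(sf, PETSCSFWINDOW);
   PetscSFWindowSetSyncType(sf, PETSCSF_WINDOW_SYNC_LOCK); // equivalent to -sf_window_sync lock
.ve
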
218: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowGetSyncType()`
219: @*/
220: PetscErrorCode PetscSFWindowSetSyncType(PetscSF sf, PetscSFWindowSyncType sync)
221: {
224:   PetscTryMethod(sf, "PetscSFWindowSetSyncType_C", (PetscSF, PetscSFWindowSyncType), (sf, sync));
225:   return 0;
226: }

228: static PetscErrorCode PetscSFWindowSetSyncType_Window(PetscSF sf, PetscSFWindowSyncType sync)
229: {
230:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

232:   w->sync = sync;
233:   return 0;
234: }

236: /*@C
237:    PetscSFWindowGetSyncType - Get synchronization type for PetscSF communication

239:    Not Collective

241:    Input Parameter:
242: .  sf - star forest for communication

244:    Output Parameter:
245: .  sync - synchronization type

247:    Level: advanced

249: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowSetSyncType()`
250: @*/
251: PetscErrorCode PetscSFWindowGetSyncType(PetscSF sf, PetscSFWindowSyncType *sync)
252: {
255:   PetscUseMethod(sf, "PetscSFWindowGetSyncType_C", (PetscSF, PetscSFWindowSyncType *), (sf, sync));
256:   return 0;
257: }

259: static PetscErrorCode PetscSFWindowGetSyncType_Window(PetscSF sf, PetscSFWindowSyncType *sync)
260: {
261:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

263:   *sync = w->sync;
264:   return 0;
265: }

267: /*@C
268:    PetscSFWindowSetInfo - Set the MPI_Info handle that will be used for subsequent windows allocation

270:    Logically Collective

272:    Input Parameters:
273: +  sf - star forest for communication
274: -  info - MPI_Info handle

276:    Level: advanced

278:    Notes: The info handle is duplicated with a call to MPI_Info_dup() unless info = MPI_INFO_NULL.

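   Example Usage: a minimal sketch (not part of the original manual page); "no_locks"
   is a standard MPI window hint:
.vb
   MPI_Info info;

   MPI_Info_create(&info);
   MPI_Info_set(info, "no_locks", "true");
   PetscSFWindowSetInfo(sf, info);
   MPI_Info_free(&info); // safe, the handle was duplicated
.ve
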
280: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowGetInfo()`
281: @*/
282: PetscErrorCode PetscSFWindowSetInfo(PetscSF sf, MPI_Info info)
283: {
285:   PetscTryMethod(sf, "PetscSFWindowSetInfo_C", (PetscSF, MPI_Info), (sf, info));
286:   return 0;
287: }

289: static PetscErrorCode PetscSFWindowSetInfo_Window(PetscSF sf, MPI_Info info)
290: {
291:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

293:   if (w->info != MPI_INFO_NULL) MPI_Info_free(&w->info);
294:   if (info != MPI_INFO_NULL) MPI_Info_dup(info, &w->info);
295:   return 0;
296: }

298: /*@C
299:    PetscSFWindowGetInfo - Get the MPI_Info handle used for windows allocation

301:    Not Collective

303:    Input Parameter:
304: .  sf - star forest for communication

306:    Output Parameter:
307: .  info - MPI_Info handle

309:    Level: advanced

311:    Notes: If PetscSFWindowSetInfo() has not been called, this returns MPI_INFO_NULL.

313: .seealso: `PetscSFSetFromOptions()`, `PetscSFWindowSetInfo()`
314: @*/
315: PetscErrorCode PetscSFWindowGetInfo(PetscSF sf, MPI_Info *info)
316: {
319:   PetscUseMethod(sf, "PetscSFWindowGetInfo_C", (PetscSF, MPI_Info *), (sf, info));
320:   return 0;
321: }

323: static PetscErrorCode PetscSFWindowGetInfo_Window(PetscSF sf, MPI_Info *info)
324: {
325:   PetscSF_Window *w = (PetscSF_Window *)sf->data;

327:   *info = w->info;
328:   return 0;
329: }

331: /*
332:    PetscSFGetWindow - Get a window for use with a given data type

334:    Collective on PetscSF

336:    Input Parameters:
337: +  sf - star forest
338: .  unit - data type
339: .  array - array to be sent
340: .  sync - type of synchronization PetscSFWindowSyncType
341: .  epoch - PETSC_TRUE to acquire the window and start an epoch, PETSC_FALSE to just acquire the window
342: .  fenceassert - assert parameter for call to MPI_Win_fence(), if sync == PETSCSF_WINDOW_SYNC_FENCE
343: .  postassert - assert parameter for call to MPI_Win_post(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE
344: -  startassert - assert parameter for call to MPI_Win_start(), if sync == PETSCSF_WINDOW_SYNC_ACTIVE

346:    Output Parameters:
347: +  target_disp - target_disp argument for RMA calls (significant for PETSCSF_WINDOW_FLAVOR_DYNAMIC only)
348: .  reqs - array of requests (significant for sync == PETSCSF_WINDOW_SYNC_LOCK only)
349: -  win - window

351:    Level: developer
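
   Example: a sketch of the checkout protocol used in this file (see the Bcast and
   Reduce implementations below):

     const MPI_Aint *target_disp;
     MPI_Request    *reqs;
     MPI_Win         win;

     PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, &reqs, &win);
     ... one-sided access epoch: MPI_Get()/MPI_Accumulate() on win ...
     PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win);
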
352: .seealso: `PetscSFGetRootRanks()`, `PetscSFWindowGetDataTypes()`
353: */
354: static PetscErrorCode PetscSFGetWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscMPIInt postassert, PetscMPIInt startassert, const MPI_Aint **target_disp, MPI_Request **reqs, MPI_Win *win)
355: {
356:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
357:   MPI_Aint        lb, lb_true, bytes, bytes_true;
358:   PetscSFWinLink  link;
359: #if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
360:   MPI_Aint winaddr;
361:   PetscInt nranks;
362: #endif
363:   PetscBool reuse = PETSC_FALSE, update = PETSC_FALSE;
364:   PetscBool dummy[2]; /* used for debug consistency checks, and as a nonzero-size attach address for dynamic windows */
365:   MPI_Aint  wsize;

367:   MPI_Type_get_extent(unit, &lb, &bytes);
368:   MPI_Type_get_true_extent(unit, &lb_true, &bytes_true);
369:   PetscCheck(lb == 0 && lb_true == 0, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for unit type with nonzero lower bound");
370:   PetscCheck(bytes == bytes_true, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for unit type with modified extent");
371:   if (w->flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
372:   for (link = w->wins; reuse && link; link = link->next) {
373:     PetscBool winok = PETSC_FALSE;
374:     if (w->flavor != link->flavor) continue;
375:     switch (w->flavor) {
376:     case PETSCSF_WINDOW_FLAVOR_DYNAMIC: /* check available matching array, error if in use (we additionally check that the matching condition is the same across processes) */
377:       if (array == link->addr) {
378:         if (PetscDefined(USE_DEBUG)) {
379:           dummy[0] = PETSC_TRUE;
380:           dummy[1] = PETSC_TRUE;
381:           MPI_Allreduce(MPI_IN_PLACE, dummy, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)sf));
382:           MPI_Allreduce(MPI_IN_PLACE, dummy + 1, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)sf));
383:           PetscCheck(dummy[0] == dummy[1], PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PETSCSF_WINDOW_FLAVOR_DYNAMIC requires the same root array to be used across all processes");
384:         }
385:         PetscCheck(!link->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Window in use");
386:         PetscCheck(!epoch || !link->epoch, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Window epoch not finished");
387:         winok       = PETSC_TRUE;
388:         link->paddr = array;
389:       } else if (PetscDefined(USE_DEBUG)) {
390:         dummy[0] = PETSC_FALSE;
391:         dummy[1] = PETSC_FALSE;
392:         MPI_Allreduce(MPI_IN_PLACE, dummy, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)sf));
393:         MPI_Allreduce(MPI_IN_PLACE, dummy + 1, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)sf));
394:         PetscCheck(dummy[0] == dummy[1], PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PETSCSF_WINDOW_FLAVOR_DYNAMIC requires the same root array to be used across all processes");
395:       }
396:       break;
397:     case PETSCSF_WINDOW_FLAVOR_ALLOCATE: /* check available by matching size, allocate if in use */
398:     case PETSCSF_WINDOW_FLAVOR_SHARED:
399:       if (!link->inuse && bytes == (MPI_Aint)link->bytes) {
400:         update      = PETSC_TRUE;
401:         link->paddr = array;
402:         winok       = PETSC_TRUE;
403:       }
404:       break;
405:     default:
406:       SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
407:     }
408:     if (winok) {
409:       *win = link->win;
410:       PetscInfo(sf, "Reusing window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n", link->win, link->flavor, PetscObjectComm((PetscObject)sf));
411:       goto found;
412:     }
413:   }

415:   wsize = (MPI_Aint)bytes * sf->nroots;
416:   PetscNew(&link);
417:   link->bytes           = bytes;
418:   link->next            = w->wins;
419:   link->flavor          = w->flavor;
420:   link->dyn_target_addr = NULL;
421:   link->reqs            = NULL;
422:   w->wins               = link;
423:   if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
424:     PetscInt i;

426:     PetscMalloc1(sf->nranks, &link->reqs);
427:     for (i = 0; i < sf->nranks; i++) link->reqs[i] = MPI_REQUEST_NULL;
428:   }
429:   switch (w->flavor) {
430:   case PETSCSF_WINDOW_FLAVOR_CREATE:
431:     MPI_Win_create(array, wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->win);
432:     link->addr  = array;
433:     link->paddr = array;
434:     break;
435: #if defined(PETSC_HAVE_MPI_FEATURE_DYNAMIC_WINDOW)
436:   case PETSCSF_WINDOW_FLAVOR_DYNAMIC:
437:     MPI_Win_create_dynamic(w->info, PetscObjectComm((PetscObject)sf), &link->win);
438:   #if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) /* some OpenMPI versions do not support MPI_Win_attach(win,NULL,0); */
439:     MPI_Win_attach(link->win, wsize ? array : (void *)dummy, wsize);
440:   #else
441:     MPI_Win_attach(link->win, array, wsize);
442:   #endif
443:     link->addr  = array;
444:     link->paddr = array;
446:     PetscSFSetUp(w->dynsf);
447:     PetscSFGetRootRanks(w->dynsf, &nranks, NULL, NULL, NULL, NULL);
448:     PetscMalloc1(nranks, &link->dyn_target_addr);
449:     MPI_Get_address(array, &winaddr);
450:     PetscSFBcastBegin(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE);
451:     PetscSFBcastEnd(w->dynsf, MPI_AINT, &winaddr, link->dyn_target_addr, MPI_REPLACE);
452:     break;
453:   case PETSCSF_WINDOW_FLAVOR_ALLOCATE:
454:     MPI_Win_allocate(wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->addr, &link->win);
455:     update      = PETSC_TRUE;
456:     link->paddr = array;
457:     break;
458: #endif
459: #if defined(PETSC_HAVE_MPI_PROCESS_SHARED_MEMORY)
460:   case PETSCSF_WINDOW_FLAVOR_SHARED:
461:     MPI_Win_allocate_shared(wsize, (PetscMPIInt)bytes, w->info, PetscObjectComm((PetscObject)sf), &link->addr, &link->win);
462:     update      = PETSC_TRUE;
463:     link->paddr = array;
464:     break;
465: #endif
466:   default:
467:     SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "No support for flavor %s", PetscSFWindowFlavorTypes[w->flavor]);
468:   }
469:   PetscInfo(sf, "New window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n", link->win, link->flavor, PetscObjectComm((PetscObject)sf));
470:   *win = link->win;

472: found:

474:   if (target_disp) *target_disp = link->dyn_target_addr;
475:   if (reqs) *reqs = link->reqs;
476:   if (update) { /* locks are needed for the "separate" memory model only, the fence guarantees memory synchronization */
477:     PetscMPIInt rank;

479:     MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank);
480:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, MPI_MODE_NOCHECK, *win);
481:     PetscMemcpy(link->addr, array, sf->nroots * bytes);
482:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) {
483:       MPI_Win_unlock(rank, *win);
484:       MPI_Win_fence(0, *win);
485:     }
486:   }
487:   link->inuse = PETSC_TRUE;
488:   link->epoch = epoch;
489:   if (epoch) {
490:     switch (sync) {
491:     case PETSCSF_WINDOW_SYNC_FENCE:
492:       MPI_Win_fence(fenceassert, *win);
493:       break;
494:     case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
495:       break;
496:     case PETSCSF_WINDOW_SYNC_ACTIVE: {
497:       MPI_Group   ingroup, outgroup;
498:       PetscMPIInt isize, osize;

500:       /* OpenMPI 4.0.2 with btl=vader does not like calling
501:          - MPI_Win_complete when ogroup is empty
502:          - MPI_Win_wait when igroup is empty
503:          So, we do not even issue the corresponding start and post calls
504:          The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
505:          start(outgroup) has a matching post(ingroup)
506:          and this is guaranteed by PetscSF
507:       */
508:       PetscSFGetGroups(sf, &ingroup, &outgroup);
509:       MPI_Group_size(ingroup, &isize);
510:       MPI_Group_size(outgroup, &osize);
511:       if (isize) MPI_Win_post(ingroup, postassert, *win);
512:       if (osize) MPI_Win_start(outgroup, startassert, *win);
513:     } break;
514:     default:
515:       SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Unknown synchronization type");
516:     }
517:   }
518:   return 0;
519: }

521: /*
522:    PetscSFFindWindow - Finds a window that is already in use

524:    Not Collective

526:    Input Parameters:
527: +  sf - star forest
528: .  unit - data type
529: -  array - array with which the window is associated

531:    Output Parameters:
532: +  win - window
533: -  reqs - outstanding requests associated to the window

535:    Level: developer

537: .seealso: `PetscSFGetWindow()`, `PetscSFRestoreWindow()`
538: */
539: static PetscErrorCode PetscSFFindWindow(PetscSF sf, MPI_Datatype unit, const void *array, MPI_Win *win, MPI_Request **reqs)
540: {
541:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
542:   PetscSFWinLink  link;

544:   *win = MPI_WIN_NULL;
545:   for (link = w->wins; link; link = link->next) {
546:     if (array == link->paddr) {
547:       PetscInfo(sf, "Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n", link->win, link->flavor, PetscObjectComm((PetscObject)sf));
548:       *win  = link->win;
549:       *reqs = link->reqs;
550:       return 0;
551:     }
552:   }
553:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");
554: }

556: /*
557:    PetscSFRestoreWindow - Restores a window obtained with PetscSFGetWindow()

559:    Collective

561:    Input Parameters:
562: +  sf - star forest
563: .  unit - data type
564: .  array - array associated with window
565: .  sync - type of synchronization PetscSFWindowSyncType
566: .  epoch - close an epoch, must match argument to PetscSFGetWindow()
567: .  update - whether to copy the window memory back into the user array
568: -  win - window

570:    Level: developer

572: .seealso: `PetscSFFindWindow()`
573: */
574: static PetscErrorCode PetscSFRestoreWindow(PetscSF sf, MPI_Datatype unit, void *array, PetscSFWindowSyncType sync, PetscBool epoch, PetscMPIInt fenceassert, PetscBool update, MPI_Win *win)
575: {
576:   PetscSF_Window         *w = (PetscSF_Window *)sf->data;
577:   PetscSFWinLink         *p, link;
578:   PetscBool               reuse = PETSC_FALSE;
579:   PetscSFWindowFlavorType flavor;
580:   void                   *laddr;
581:   size_t                  bytes;

583:   for (p = &w->wins; *p; p = &(*p)->next) {
584:     link = *p;
585:     if (*win == link->win) {
586:       PetscCheck(array == link->paddr, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Matched window, but not array");
587:       if (epoch != link->epoch) {
588:         PetscCheck(!epoch, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "No epoch to end");
589:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Restoring window without ending epoch");
590:       }
591:       laddr  = link->addr;
592:       flavor = link->flavor;
593:       bytes  = link->bytes;
594:       if (flavor != PETSCSF_WINDOW_FLAVOR_CREATE) reuse = PETSC_TRUE;
595:       else {
596:         *p     = link->next;
597:         update = PETSC_FALSE;
598:       } /* remove from list */
599:       goto found;
600:     }
601:   }
602:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Requested window not in use");

604: found:
605:   PetscInfo(sf, "Window %" PETSC_MPI_WIN_FMT " of flavor %d for comm %" PETSC_MPI_COMM_FMT "\n", link->win, link->flavor, PetscObjectComm((PetscObject)sf));
606:   if (epoch) {
607:     switch (sync) {
608:     case PETSCSF_WINDOW_SYNC_FENCE:
609:       MPI_Win_fence(fenceassert, *win);
610:       break;
611:     case PETSCSF_WINDOW_SYNC_LOCK: /* Handled outside */
612:       break;
613:     case PETSCSF_WINDOW_SYNC_ACTIVE: {
614:       MPI_Group   ingroup, outgroup;
615:       PetscMPIInt isize, osize;

617:       /* OpenMPI 4.0.2 with btl=vader does not like calling
618:          - MPI_Win_complete when ogroup is empty
619:          - MPI_Win_wait when igroup is empty
620:          The MPI standard (Sec. 11.5.2 of MPI 3.1) only requires that
621:          - each process that issues a call to MPI_Win_start issues a call to MPI_Win_complete
622:          - each process that issues a call to MPI_Win_post issues a call to MPI_Win_wait
623:       */
624:       PetscSFGetGroups(sf, &ingroup, &outgroup);
625:       MPI_Group_size(ingroup, &isize);
626:       MPI_Group_size(outgroup, &osize);
627:       if (osize) MPI_Win_complete(*win);
628:       if (isize) MPI_Win_wait(*win);
629:     } break;
630:     default:
631:       SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_PLIB, "Unknown synchronization type");
632:     }
633:   }
634:   if (update) {
635:     if (sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, *win);
636:     PetscMemcpy(array, laddr, sf->nroots * bytes);
637:   }
638:   link->epoch = PETSC_FALSE;
639:   link->inuse = PETSC_FALSE;
640:   link->paddr = NULL;
641:   if (!reuse) {
642:     PetscFree(link->dyn_target_addr);
643:     PetscFree(link->reqs);
644:     MPI_Win_free(&link->win);
645:     PetscFree(link);
646:     *win = MPI_WIN_NULL;
647:   }
648:   return 0;
649: }
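
/* Illustrative sketch (not part of the original file): how the Begin/End split in
   this file uses the helpers above. The Begin phase checks the window out and opens
   the epoch; the End phase looks the window up again by its root array, drains any
   outstanding requests, and returns it.

     // in PetscSFXxxBegin_Window()
     PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, &reqs, &win);
     // ... RMA calls on win ...

     // in PetscSFXxxEnd_Window()
     PetscSFFindWindow(sf, unit, rootdata, &win, &reqs);
     if (reqs) MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE);
     PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win);
*/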

651: static PetscErrorCode PetscSFSetUp_Window(PetscSF sf)
652: {
653:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
654:   MPI_Group       ingroup, outgroup;

656:   PetscSFSetUpRanks(sf, MPI_GROUP_EMPTY);
657:   if (!w->dynsf) {
658:     PetscInt     i;
659:     PetscSFNode *remotes;

661:     PetscMalloc1(sf->nranks, &remotes);
662:     for (i = 0; i < sf->nranks; i++) {
663:       remotes[i].rank  = sf->ranks[i];
664:       remotes[i].index = 0;
665:     }
666:     PetscSFDuplicate(sf, PETSCSF_DUPLICATE_RANKS, &w->dynsf);
667:     PetscSFWindowSetFlavorType(w->dynsf, PETSCSF_WINDOW_FLAVOR_CREATE); /* break recursion */
668:     PetscSFSetGraph(w->dynsf, 1, sf->nranks, NULL, PETSC_OWN_POINTER, remotes, PETSC_OWN_POINTER);
669:   }
670:   switch (w->sync) {
671:   case PETSCSF_WINDOW_SYNC_ACTIVE:
672:     PetscSFGetGroups(sf, &ingroup, &outgroup);
673:   default:
674:     break;
675:   }
676:   return 0;
677: }

679: static PetscErrorCode PetscSFSetFromOptions_Window(PetscSF sf, PetscOptionItems *PetscOptionsObject)
680: {
681:   PetscSF_Window         *w      = (PetscSF_Window *)sf->data;
682:   PetscSFWindowFlavorType flavor = w->flavor;

684:   PetscOptionsHeadBegin(PetscOptionsObject, "PetscSF Window options");
685:   PetscOptionsEnum("-sf_window_sync", "synchronization type to use for PetscSF Window communication", "PetscSFWindowSetSyncType", PetscSFWindowSyncTypes, (PetscEnum)w->sync, (PetscEnum *)&w->sync, NULL);
686:   PetscOptionsEnum("-sf_window_flavor", "flavor to use for PetscSF Window creation", "PetscSFWindowSetFlavorType", PetscSFWindowFlavorTypes, (PetscEnum)flavor, (PetscEnum *)&flavor, NULL);
687:   PetscSFWindowSetFlavorType(sf, flavor);
688:   PetscOptionsHeadEnd();
689:   return 0;
690: }

692: static PetscErrorCode PetscSFReset_Window(PetscSF sf)
693: {
694:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
695:   PetscSFDataLink link, next;
696:   PetscSFWinLink  wlink, wnext;
697:   PetscInt        i;

699:   for (link = w->link; link; link = next) {
700:     next = link->next;
701:     MPI_Type_free(&link->unit);
702:     for (i = 0; i < sf->nranks; i++) {
703:       MPI_Type_free(&link->mine[i]);
704:       MPI_Type_free(&link->remote[i]);
705:     }
706:     PetscFree2(link->mine, link->remote);
707:     PetscFree(link);
708:   }
709:   w->link = NULL;
710:   for (wlink = w->wins; wlink; wlink = wnext) {
711:     wnext = wlink->next;
712:     PetscCheck(!wlink->inuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Window still in use with address %p", (void *)wlink->addr);
713:     PetscFree(wlink->dyn_target_addr);
714:     PetscFree(wlink->reqs);
715:     MPI_Win_free(&wlink->win);
716:     PetscFree(wlink);
717:   }
718:   w->wins = NULL;
719:   PetscSFDestroy(&w->dynsf);
720:   if (w->info != MPI_INFO_NULL) MPI_Info_free(&w->info);
721:   return 0;
722: }

724: static PetscErrorCode PetscSFDestroy_Window(PetscSF sf)
725: {
726:   PetscSFReset_Window(sf);
727:   PetscFree(sf->data);
728:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", NULL);
729:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", NULL);
730:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", NULL);
731:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", NULL);
732:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", NULL);
733:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", NULL);
734:   return 0;
735: }

737: static PetscErrorCode PetscSFView_Window(PetscSF sf, PetscViewer viewer)
738: {
739:   PetscSF_Window   *w = (PetscSF_Window *)sf->data;
740:   PetscBool         iascii;
741:   PetscViewerFormat format;

743:   PetscViewerGetFormat(viewer, &format);
744:   PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii);
745:   if (iascii) {
746:     PetscViewerASCIIPrintf(viewer, "  current flavor=%s synchronization=%s MultiSF sort=%s\n", PetscSFWindowFlavorTypes[w->flavor], PetscSFWindowSyncTypes[w->sync], sf->rankorder ? "rank-order" : "unordered");
747:     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
748:       if (w->info != MPI_INFO_NULL) {
749:         PetscMPIInt k, nkeys;
750:         char        key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL];

752:         MPI_Info_get_nkeys(w->info, &nkeys);
753:         PetscViewerASCIIPrintf(viewer, "    current info with %d keys. Ordered key-value pairs follow:\n", nkeys);
754:         for (k = 0; k < nkeys; k++) {
755:           PetscMPIInt flag;

757:           MPI_Info_get_nthkey(w->info, k, key);
758:           MPI_Info_get(w->info, key, MPI_MAX_INFO_VAL, value, &flag);
759:           PetscCheck(flag, PetscObjectComm((PetscObject)sf), PETSC_ERR_LIB, "Missing key %s", key);
760:           PetscViewerASCIIPrintf(viewer, "      %s = %s\n", key, value);
761:         }
762:       } else {
763:         PetscViewerASCIIPrintf(viewer, "    current info=MPI_INFO_NULL\n");
764:       }
765:     }
766:   }
767:   return 0;
768: }

770: static PetscErrorCode PetscSFDuplicate_Window(PetscSF sf, PetscSFDuplicateOption opt, PetscSF newsf)
771: {
772:   PetscSF_Window       *w = (PetscSF_Window *)sf->data;
773:   PetscSFWindowSyncType synctype;

775:   synctype = w->sync;
776:   /* HACK: Must use FENCE or LOCK when called from PetscSFGetGroups() because ACTIVE here would cause recursion. */
777:   if (!sf->setupcalled) synctype = PETSCSF_WINDOW_SYNC_LOCK;
778:   PetscSFWindowSetSyncType(newsf, synctype);
779:   PetscSFWindowSetFlavorType(newsf, w->flavor);
780:   PetscSFWindowSetInfo(newsf, w->info);
781:   return 0;
782: }

784: static PetscErrorCode PetscSFBcastBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, void *leafdata, MPI_Op op)
785: {
786:   PetscSF_Window     *w = (PetscSF_Window *)sf->data;
787:   PetscInt            i, nranks;
788:   const PetscMPIInt  *ranks;
789:   const MPI_Aint     *target_disp;
790:   const MPI_Datatype *mine, *remote;
791:   MPI_Request        *reqs;
792:   MPI_Win             win;

794:   PetscCheck(op == MPI_REPLACE, PetscObjectComm((PetscObject)sf), PETSC_ERR_SUP, "PetscSFBcastBegin_Window with op!=MPI_REPLACE has not been implemented");
795:   PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL);
796:   PetscSFWindowGetDataTypes(sf, unit, &mine, &remote);
797:   PetscSFGetWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, MPI_MODE_NOPUT, 0, &target_disp, &reqs, &win);
798:   for (i = 0; i < nranks; i++) {
799:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

801:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
802:       MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win);
803: #if defined(PETSC_HAVE_MPI_RGET)
804:       MPI_Rget(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win, &reqs[i]);
805: #else
806:       MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win);
807: #endif
808:     } else {
809:       MPI_Get(leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], win);
810:     }
811:   }
812:   return 0;
813: }

815: PetscErrorCode PetscSFBcastEnd_Window(PetscSF sf, MPI_Datatype unit, const void *rootdata, void *leafdata, MPI_Op op)
816: {
817:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
818:   MPI_Win         win;
819:   MPI_Request    *reqs = NULL;

821:   PetscSFFindWindow(sf, unit, rootdata, &win, &reqs);
822:   if (reqs) MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE);
823:   if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) {
824:     PetscInt           i, nranks;
825:     const PetscMPIInt *ranks;

827:     PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL);
828:     for (i = 0; i < nranks; i++) MPI_Win_unlock(ranks[i], win);
829:   }
830:   PetscSFRestoreWindow(sf, unit, (void *)rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSTORE | MPI_MODE_NOSUCCEED, PETSC_FALSE, &win);
831:   return 0;
832: }

834: PetscErrorCode PetscSFReduceBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType leafmtype, const void *leafdata, PetscMemType rootmtype, void *rootdata, MPI_Op op)
835: {
836:   PetscSF_Window     *w = (PetscSF_Window *)sf->data;
837:   PetscInt            i, nranks;
838:   const PetscMPIInt  *ranks;
839:   const MPI_Aint     *target_disp;
840:   const MPI_Datatype *mine, *remote;
841:   MPI_Win             win;

843:   PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL);
844:   PetscSFWindowGetDataTypes(sf, unit, &mine, &remote);
845:   PetscSFWindowOpTranslate(&op);
846:   PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win);
847:   for (i = 0; i < nranks; i++) {
848:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

850:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], MPI_MODE_NOCHECK, win);
851:     MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win);
852:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_unlock(ranks[i], win);
853:   }
854:   return 0;
855: }

857: static PetscErrorCode PetscSFReduceEnd_Window(PetscSF sf, MPI_Datatype unit, const void *leafdata, void *rootdata, MPI_Op op)
858: {
859:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
860:   MPI_Win         win;
861:   MPI_Request    *reqs = NULL;

863:   PetscSFFindWindow(sf, unit, rootdata, &win, &reqs);
864:   if (reqs) MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE);
865:   PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win);
866:   return 0;
867: }

869: static PetscErrorCode PetscSFFetchAndOpBegin_Window(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, void *rootdata, PetscMemType leafmtype, const void *leafdata, void *leafupdate, MPI_Op op)
870: {
871:   PetscInt            i, nranks;
872:   const PetscMPIInt  *ranks;
873:   const MPI_Datatype *mine, *remote;
874:   const MPI_Aint     *target_disp;
875:   MPI_Win             win;
876:   PetscSF_Window     *w = (PetscSF_Window *)sf->data;
877: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
878:   PetscSFWindowFlavorType oldf;
879: #endif

881:   PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL);
882:   PetscSFWindowGetDataTypes(sf, unit, &mine, &remote);
883:   PetscSFWindowOpTranslate(&op);
884: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
885:   /* FetchAndOp without MPI_Get_accumulate requires locking.
886:      We create a new window every time to avoid interfering with a user-defined MPI_Info that may have set "no_locks" to "true" */
887:   oldf      = w->flavor;
888:   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
889:   PetscSFGetWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, 0, 0, &target_disp, NULL, &win);
890: #else
891:   PetscSFGetWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOPRECEDE, 0, 0, &target_disp, NULL, &win);
892: #endif
893:   for (i = 0; i < nranks; i++) {
894:     MPI_Aint tdp = target_disp ? target_disp[i] : 0;

896: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
897:     MPI_Win_lock(MPI_LOCK_EXCLUSIVE, ranks[i], 0, win);
898:     MPI_Get(leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], win);
899:     MPI_Accumulate((void *)leafdata, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win);
900:     MPI_Win_unlock(ranks[i], win);
901: #else
902:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_lock(MPI_LOCK_SHARED, ranks[i], 0, win);
903:     MPI_Get_accumulate((void *)leafdata, 1, mine[i], leafupdate, 1, mine[i], ranks[i], tdp, 1, remote[i], op, win);
904:     if (w->sync == PETSCSF_WINDOW_SYNC_LOCK) MPI_Win_unlock(ranks[i], win);
905: #endif
906:   }
907: #if !defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
908:   w->flavor = oldf;
909: #endif
910:   return 0;
911: }

913: static PetscErrorCode PetscSFFetchAndOpEnd_Window(PetscSF sf, MPI_Datatype unit, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
914: {
915:   MPI_Win win;
916: #if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
917:   PetscSF_Window *w = (PetscSF_Window *)sf->data;
918: #endif
919:   MPI_Request *reqs = NULL;

921:   PetscSFFindWindow(sf, unit, rootdata, &win, &reqs);
922:   if (reqs) MPI_Waitall(sf->nranks, reqs, MPI_STATUSES_IGNORE);
923: #if defined(PETSC_HAVE_MPI_GET_ACCUMULATE)
924:   PetscSFRestoreWindow(sf, unit, rootdata, w->sync, PETSC_TRUE, MPI_MODE_NOSUCCEED, PETSC_TRUE, &win);
925: #else
926:   PetscSFRestoreWindow(sf, unit, rootdata, PETSCSF_WINDOW_SYNC_LOCK, PETSC_FALSE, 0, PETSC_TRUE, &win);
927: #endif
928:   return 0;
929: }

931: PETSC_INTERN PetscErrorCode PetscSFCreate_Window(PetscSF sf)
932: {
933:   PetscSF_Window *w;

935:   sf->ops->SetUp           = PetscSFSetUp_Window;
936:   sf->ops->SetFromOptions  = PetscSFSetFromOptions_Window;
937:   sf->ops->Reset           = PetscSFReset_Window;
938:   sf->ops->Destroy         = PetscSFDestroy_Window;
939:   sf->ops->View            = PetscSFView_Window;
940:   sf->ops->Duplicate       = PetscSFDuplicate_Window;
941:   sf->ops->BcastBegin      = PetscSFBcastBegin_Window;
942:   sf->ops->BcastEnd        = PetscSFBcastEnd_Window;
943:   sf->ops->ReduceBegin     = PetscSFReduceBegin_Window;
944:   sf->ops->ReduceEnd       = PetscSFReduceEnd_Window;
945:   sf->ops->FetchAndOpBegin = PetscSFFetchAndOpBegin_Window;
946:   sf->ops->FetchAndOpEnd   = PetscSFFetchAndOpEnd_Window;

948:   PetscNew(&w);
949:   sf->data  = (void *)w;
950:   w->sync   = PETSCSF_WINDOW_SYNC_FENCE;
951:   w->flavor = PETSCSF_WINDOW_FLAVOR_CREATE;
952:   w->info   = MPI_INFO_NULL;

954:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetSyncType_C", PetscSFWindowSetSyncType_Window);
955:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetSyncType_C", PetscSFWindowGetSyncType_Window);
956:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetFlavorType_C", PetscSFWindowSetFlavorType_Window);
957:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetFlavorType_C", PetscSFWindowGetFlavorType_Window);
958:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowSetInfo_C", PetscSFWindowSetInfo_Window);
959:   PetscObjectComposeFunction((PetscObject)sf, "PetscSFWindowGetInfo_C", PetscSFWindowGetInfo_Window);

961: #if defined(OMPI_MAJOR_VERSION) && (OMPI_MAJOR_VERSION < 1 || (OMPI_MAJOR_VERSION == 1 && OMPI_MINOR_VERSION <= 6))
962:   {
963:     PetscBool ackbug = PETSC_FALSE;
964:     PetscOptionsGetBool(NULL, NULL, "-acknowledge_ompi_onesided_bug", &ackbug, NULL);
965:     if (ackbug) {
966:       PetscInfo(sf, "Acknowledged Open MPI bug, proceeding anyway. Expect memory corruption.\n");
967:     } else SETERRQ(PetscObjectComm((PetscObject)sf), PETSC_ERR_LIB, "Open MPI is known to be buggy (https://svn.open-mpi.org/trac/ompi/ticket/1905 and 2656), use -acknowledge_ompi_onesided_bug to proceed");
968:   }
969: #endif
970:   return 0;
971: }
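
/* Illustrative sketch (not part of the original file): selecting this implementation
   from user code. Assumes `nroots`, `nleaves`, and `remotes` describe a valid star
   forest on PETSC_COMM_WORLD.

     PetscSF sf;

     PetscSFCreate(PETSC_COMM_WORLD, &sf);
     PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_COPY_VALUES, remotes, PETSC_COPY_VALUES);
     PetscSFSetType(sf, PETSCSFWINDOW);
     PetscSFSetFromOptions(sf);  // picks up -sf_window_sync and -sf_window_flavor
     PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
     PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE);
     PetscSFDestroy(&sf);
*/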