Actual source code: sfneighbor.c

  1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
  2: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
  3: #include <petscpkg_version.h>

  5: /* Convenience local types and wrappers */
  6: #if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  7: typedef MPI_Count PetscSFCount;
  8: typedef MPI_Aint  PetscSFAint;
  9:   #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv_c(a, b, c, d, e, f, g, h, i)
 10:   #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init_c(a, b, c, d, e, f, g, h, i, j, k)
 11:   #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv_c(a, b, c, d, e, f, g, h, i, j)
 12: #else
 13: typedef PetscMPIInt PetscSFCount;
 14: typedef PetscMPIInt PetscSFAint;
 15:   #define MPIU_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)            MPI_Neighbor_alltoallv(a, b, c, d, e, f, g, h, i)
 16:   #define MPIU_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k) MPI_Neighbor_alltoallv_init(a, b, c, d, e, f, g, h, i, j, k)
 17:   #define MPIU_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)        MPI_Ineighbor_alltoallv(a, b, c, d, e, f, g, h, i, j)
 18: #endif
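/*
  Editorial sketch (not part of the original file): the "_c"-suffixed routines are the MPI-4
  large-count variants, which take MPI_Count counts and MPI_Aint displacements so a single
  neighbor's message is not limited to INT_MAX elements when PETSc uses 64-bit indices.
  A minimal illustration, assuming an MPI-4 library and hypothetical buffers:

    MPI_Count sendcount = (MPI_Count)3000000000;  // would overflow a 32-bit int
    MPI_Aint  senddispl = 0;
    // MPI_Neighbor_alltoallv_c(sendbuf, &sendcount, &senddispl, MPI_DOUBLE,
    //                          recvbuf, &recvcount, &recvdispl, MPI_DOUBLE, distgraphcomm);
*/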

 20: typedef struct {
 21:   SFBASICHEADER;
 22:   MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
 23:   PetscBool     initialized[2];          /* Are the two communicators initialized? */
 24:   PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
 25:   PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
 26:   PetscMPIInt  *rootweights, *leafweights; /* edge weights for the distributed graph topology */
 27:   PetscInt      rootdegree, leafdegree;    /* numbers of non-distinguished root/leaf neighbor ranks */
 28: } PetscSF_Neighbor;
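/*
  Editorial note: comms[] and initialized[] above are indexed by PetscSFDirection
  (PETSCSF_ROOT2LEAF and PETSCSF_LEAF2ROOT), so each communication direction gets its own
  distributed-graph communicator, built lazily in PetscSFGetDistComm_Neighbor() below.
*/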

 30: /*===================================================================================*/
 31: /*              Internal utility routines                                            */
 32: /*===================================================================================*/

 34: static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
 35: {
 36:   PetscFunctionBegin;
 37:   if (PetscDefined(USE_LOG)) {
 38:     petsc_isend_ct += (PetscLogDouble)nsend;
 39:     petsc_irecv_ct += (PetscLogDouble)nrecv;

 41:     if (sendtype != MPI_DATATYPE_NULL) {
 42:       PetscMPIInt i, typesize;
 43:       PetscCallMPI(MPI_Type_size(sendtype, &typesize));
 44:       for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
 45:     }

 47:     if (recvtype != MPI_DATATYPE_NULL) {
 48:       PetscMPIInt i, typesize;
 49:       PetscCallMPI(MPI_Type_size(recvtype, &typesize));
 50:       for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
 51:     }
 52:   }
 53:   PetscFunctionReturn(PETSC_SUCCESS);
 54: }

 56: /* Get the communicator with distributed graph topology; it is not cheap to build, so we build it on demand (instead of at PetscSFSetUp time) */
 57: static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
 58: {
 59:   PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

 61:   PetscFunctionBegin;
 62:   if (!dat->initialized[direction]) {
 63:     PetscMPIInt        nrootranks, ndrootranks, nleafranks, ndleafranks;
 64:     PetscMPIInt        indegree, outdegree;
 65:     const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
 66:     MPI_Comm           comm, *mycomm = &dat->comms[direction];

 68:     PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
 69:     PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Whose roots my leaves will access (I am a source) */
 70:     indegree     = nrootranks - ndrootranks;
 71:     outdegree    = nleafranks - ndleafranks;
 72:     sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
 73:     destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
 74:     PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
 75:     if (direction == PETSCSF_LEAF2ROOT) {
 76:       PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
 77:     } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
 78:       PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
 79:     }
 80:     dat->initialized[direction] = PETSC_TRUE;
 81:   }
 82:   *distcomm = dat->comms[direction];
 83:   PetscFunctionReturn(PETSC_SUCCESS);
 84: }
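/*
  A minimal standalone sketch (hypothetical ranks and weights, not part of this file) of the
  MPI_Dist_graph_create_adjacent() call used above. For PETSCSF_LEAF2ROOT, the ranks whose
  leaves update my roots are the sources and the ranks owning the roots my leaves reference
  are the destinations; PETSCSF_ROOT2LEAF simply swaps the two lists.

    int      indeg = 1, outdeg = 2;
    int      srcs[1] = {3}, srcw[1] = {10};       // incoming neighbors and edge weights
    int      dsts[2] = {1, 4}, dstw[2] = {5, 7};  // outgoing neighbors and edge weights
    MPI_Comm distcomm;
    MPI_Dist_graph_create_adjacent(MPI_COMM_WORLD, indeg, srcs, srcw, outdeg, dsts, dstw,
                                   MPI_INFO_NULL, 1, &distcomm);  // reorder = 1, as above
*/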

 86: // Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
 87: static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
 88: {
 89:   PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
 90:   MPI_Comm          distcomm = MPI_COMM_NULL;
 91:   void             *rootbuf = NULL, *leafbuf = NULL;
 92:   MPI_Request      *req = NULL;

 94:   PetscFunctionBegin;
 95:   if (direction == PETSCSF_ROOT2LEAF) {
 96:     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
 97:   } else {
 98:     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
 99:   }

101:   PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
102:   PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
103:   PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

105:   if (dat->rootdegree || dat->leafdegree) { // OpenMPI-3.0 ran into an error with rootdegree = leafdegree = 0, so we skip the call in that case
106:     if (direction == PETSCSF_ROOT2LEAF) {
107:       PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
108:       PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
109:     } else {
110:       PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
111:       PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
112:     }
113:   }
114:   PetscFunctionReturn(PETSC_SUCCESS);
115: }
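/*
  A minimal standalone sketch (hypothetical buffers and counts) of the nonblocking
  neighborhood exchange started above. The request returned here is completed later by the
  generic SF wait path in the corresponding *End routine, not in this function.

    MPI_Request req;
    // counts/displs arrays are ordered per neighbor, following the order fixed by
    // MPI_Dist_graph_create_adjacent()
    MPI_Ineighbor_alltoallv(sendbuf, sendcounts, sdispls, MPI_DOUBLE,
                            recvbuf, recvcounts, rdispls, MPI_DOUBLE, distcomm, &req);
    // ... overlap local work ...
    MPI_Wait(&req, MPI_STATUS_IGNORE);
*/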

117: #if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
118: static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
119: {
120:   PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
121:   MPI_Comm           distcomm      = MPI_COMM_NULL;
122:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
123:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
124:   MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
125:   void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
126:   MPI_Info           info;

128:   PetscFunctionBegin;
129:   PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
130:   if (dat->rootdegree || dat->leafdegree) {
131:     if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
132:       PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
133:       if (direction == PETSCSF_ROOT2LEAF) {
134:         PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
135:       } else {
136:         PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
137:       }
138:       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
139:       PetscCallMPI(MPI_Info_free(&info));
140:     }
141:   }
142:   PetscFunctionReturn(PETSC_SUCCESS);
143: }

145: // Start MPI requests. If using non-GPU-aware MPI, we might need to copy data from the device buffer to the host buffer
146: static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
147: {
148:   PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
149:   MPI_Request      *req = NULL;

151:   PetscFunctionBegin;
152:   if (direction == PETSCSF_ROOT2LEAF) {
153:     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
154:   } else {
155:     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
156:   }

158:   PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
159:   PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
160:   if (dat->rootdegree || dat->leafdegree) {
161:     PetscCallMPI(MPI_Start(req));
162:     if (direction == PETSCSF_ROOT2LEAF) {
163:       PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
164:     } else {
165:       PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
166:     }
167:   }
168:   PetscFunctionReturn(PETSC_SUCCESS);
169: }
170: #endif
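/*
  A minimal sketch (illustrative only, assuming an MPI-4 library) of the persistent pattern
  implemented by the two routines above: the schedule is created once per link and direction
  with fixed buffers, and each later round only needs MPI_Start() plus a wait.

    MPI_Request req;
    MPI_Neighbor_alltoallv_init(sendbuf, sendcounts, sdispls, MPI_DOUBLE,
                                recvbuf, recvcounts, rdispls, MPI_DOUBLE,
                                distcomm, MPI_INFO_NULL, &req);  // once
    for (int round = 0; round < nrounds; round++) {
      MPI_Start(&req);                   // reuse the prebuilt communication schedule
      MPI_Wait(&req, MPI_STATUS_IGNORE);
    }
    MPI_Request_free(&req);              // when the buffers/link go away
*/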

172: static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
173: {
174:   PetscFunctionBegin;
175: #if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
176:   if (sf->persistent) {
177:     link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
178:     link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
179:   } else
180: #endif
181:   {
182:     link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
183:   }
184:   PetscFunctionReturn(PETSC_SUCCESS);
185: }

187: /*===================================================================================*/
188: /*              Implementations of SF public APIs                                    */
189: /*===================================================================================*/
190: static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
191: {
192:   PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
193:   PetscMPIInt       nrootranks, ndrootranks, nleafranks, ndleafranks;
194:   const PetscInt   *rootoffset, *leafoffset;
195:   PetscMPIInt       m, n, m2, n2;

197:   PetscFunctionBegin;
198:   /* SFNeighbor inherits from Basic */
199:   PetscCall(PetscSFSetUp_Basic(sf));
200:   /* SFNeighbor specific */
201:   PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
202:   PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
203:   dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
204:   dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
205:   sf->nleafreqs       = 0;
206:   dat->nrootreqs      = 1; // collectives only need one MPI_Request. We just put it in rootreqs[]

208:   m2 = m;
209:   n2 = n;
210: #if defined(PETSC_HAVE_OPENMPI) // workaround for an OpenMPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
211:   #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
212:   m2 = m ? m : 1;
213:   n2 = n ? n : 1;
214:   #endif
215: #endif
216:   // Only set up MPI displs/counts for non-distinguished ranks; distinguished ranks use shared memory
217:   PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

219: #if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
220:   for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
221:     dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
222:     dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
223:     dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647); /* Scale to range of PetscMPIInt */
224:   }

226:   for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
227:     dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
228:     dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
229:     dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_INT_MAX * 2147483647);
230:   }
231: #else
232:   for (PetscMPIInt i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
233:     PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
234:     dat->rootdispls[j] = m;
235:     PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
236:     dat->rootcounts[j]  = n;
237:     dat->rootweights[j] = n;
238:   }

240:   for (PetscMPIInt i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
241:     PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
242:     dat->leafdispls[j] = m;
243:     PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
244:     dat->leafcounts[j]  = n;
245:     dat->leafweights[j] = n;
246:   }
247: #endif
248:   PetscFunctionReturn(PETSC_SUCCESS);
249: }
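/*
  A small worked example (hypothetical numbers) of the offset-to-counts/displs conversion
  above. With nrootranks = 3, ndrootranks = 1 (one distinguished, shared-memory rank) and
  rootoffset = {0, 4, 9, 12}, only the last two ranks communicate through MPI, so

    rootcounts = {9 - 4, 12 - 9} = {5, 3}
    rootdispls = {4 - 4,  9 - 4} = {0, 5}

  i.e. displacements are measured from the start of the remote (non-distinguished) part of
  the packed root buffer.
*/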

251: static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
252: {
253:   PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

255:   PetscFunctionBegin;
256:   PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
257:   PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
258:   for (int i = 0; i < 2; i++) {
259:     if (dat->initialized[i]) {
260:       PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
261:       dat->initialized[i] = PETSC_FALSE;
262:     }
263:   }
264:   PetscCall(PetscSFReset_Basic(sf)); /* Common part */
265:   PetscFunctionReturn(PETSC_SUCCESS);
266: }

268: static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
269: {
270:   PetscFunctionBegin;
271:   PetscCall(PetscSFReset_Neighbor(sf));
272:   PetscCall(PetscFree(sf->data));
273:   PetscFunctionReturn(PETSC_SUCCESS);
274: }

276: PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
277: {
278:   PetscSF_Neighbor *dat;

280:   PetscFunctionBegin;
281:   sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
282:   sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
283:   sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
284:   sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
285:   sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
286:   sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
287:   sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
288:   sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
289:   sf->ops->View                 = PetscSFView_Basic;

291:   sf->ops->SetUp               = PetscSFSetUp_Neighbor;
292:   sf->ops->Reset               = PetscSFReset_Neighbor;
293:   sf->ops->Destroy             = PetscSFDestroy_Neighbor;
294:   sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

296: #if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
297:   PetscObjectOptionsBegin((PetscObject)sf);
298:   PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
299:   PetscOptionsEnd();
300: #endif
301:   sf->collective = PETSC_TRUE;

303:   PetscCall(PetscNew(&dat));
304:   sf->data = (void *)dat;
305:   PetscFunctionReturn(PETSC_SUCCESS);
306: }
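/*
  A minimal usage sketch (illustrative only; the graph arguments are hypothetical) showing
  how a user would select this implementation:

    PetscSF sf;
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
    PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR));  // or PetscSFSetFromOptions(sf) with -sf_type neighbor
    // with MPI-4: -sf_neighbor_persistent enables persistent neighborhood collectives
    PetscCall(PetscSFSetGraph(sf, nroots, nleaves, ilocal, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf));                     // runs PetscSFSetUp_Neighbor() above
*/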