Actual source code: sfneighbor.c

#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif
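/* With 64-bit PetscInt and an MPI supporting large counts, counts and displacements are stored as
   MPI_Count/MPI_Aint so that per-neighbor message sizes are not limited to the range of int */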

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;

/*===================================================================================*/
/*              Internal utility routines                                            */
/*===================================================================================*/

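/* Log nsend sends and nrecv receives with the PETSc message counters; sendcnts/recvcnts give the
   per-neighbor counts in units of the corresponding MPI datatype */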
static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology, which is not cheap to build, so we do it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Whose roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
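    /* For PETSCSF_LEAF2ROOT, this rank receives from the (non-distinguished) ranks that access its roots
       and sends to the owners of the roots its leaves reference; PETSCSF_ROOT2LEAF swaps sources and destinations */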
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // OpenMPI-3.0 ran into an error with rootdegree = leafdegree = 0, so we skip the call in this case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
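// With MPI-4 persistent neighborhood collectives, the request is created once per link
// (MPIU_Neighbor_alltoallv_init) and then restarted with MPI_Start() for each communication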
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm      = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to a host buffer
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

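// Select the per-link communication callbacks: persistent neighborhood collectives when available and
// requested via sf->persistent, otherwise the nonblocking MPIU_Ineighbor_alltoallv path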
static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              Implementations of SF public APIs                                    */
/*===================================================================================*/
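// Build the per-neighbor counts, displacements and edge weights (non-distinguished ranks only) that
// parameterize the neighborhood collectives; distinguished ranks communicate through shared memory instead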
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs       = 0;
  dat->nrootreqs      = 1; // collectives only need one MPI_Request. We just put it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an OpenMPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
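  // Counts/displs may exceed the int range here, but the weights passed to MPI_Dist_graph_create_adjacent()
  // are plain int, so each count is scaled down proportionally into the PetscMPIInt range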
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

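// Free the neighbor-specific count/displ/weight arrays and the cached distributed-graph communicators,
// then reset the common Basic part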
static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

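// SFNEIGHBOR reuses the Basic bcast/reduce/fetch-and-op begin/end logic; only the setup, reset and the
// way MPI communication is started differ from SFBASIC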
PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}