Actual source code: sfneighbor.c
#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types */
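/* With MPI large-count support and 64-bit PetscInt, use MPI_Count/MPI_Aint so that per-neighbor
   counts and displacements are not limited to 2^31 - 1; otherwise plain MPI ints suffice */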
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;

/*===================================================================================*/
/* Internal utility routines */
/*===================================================================================*/
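/* Log the given sends/receives into PETSc's message counters (shown by -log_view); a
   neighborhood collective is counted as nsend sends and nrecv receives */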
static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology; it is not cheap to build, so we create it on demand (instead of at PetscSFSetUp() time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Which ranks' roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
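    /* reorder = 1 in the calls below lets MPI renumber ranks in the new communicator for a
       better topology mapping; the order of neighbors (and hence of the buffer segments used
       by the alltoallv calls) is preserved */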
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // Open MPI 3.0 errored when rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
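/* Build the persistent request for the neighborhood collective once per link and direction;
   subsequent communications merely MPI_Start() it (see PetscSFLinkStartCommunication_Persistent_Neighbor() below) */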
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm      = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start the MPI requests. If using non-GPU-aware MPI, we may first need to copy data from a device buffer to a host buffer
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/* Implementations of SF public APIs */
/*===================================================================================*/
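/* Set up the counts, displacements, and edge weights needed by the neighborhood collectives,
   on top of the rank information computed by PetscSFSetUp_Basic() */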
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs   = 0;
  dat->nrootreqs  = 1; // collectives need only one MPI_Request; we just put it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an Open MPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));
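  /* Displacements are measured from the start of the non-distinguished part of the packed
     buffers, since distinguished (local/shared-memory) ranks take no part in the MPI graph */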
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to the range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
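
/* A minimal usage sketch (illustrative, not part of this file): this implementation is
   selected at run time with -sf_type neighbor (optionally together with
   -sf_neighbor_persistent), or programmatically via PetscSFSetType(). Here nroots, nleaves,
   iremote, rootdata, and leafdata are assumed to be supplied by the caller:

     PetscSF sf;

     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
     PetscCall(PetscSFSetType(sf, PETSCSFNEIGHBOR));
     PetscCall(PetscSFSetGraph(sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_COPY_VALUES));
     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, rootdata, leafdata, MPI_REPLACE));
     PetscCall(PetscSFDestroy(&sf));
*/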