Actual source code: sfimpl.h
#pragma once

#include <petscvec.h>
#include <petscsf.h>
#include <petsc/private/deviceimpl.h>
#include <petsc/private/petscimpl.h>

PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;

typedef enum {
  PETSCSF_ROOT2LEAF = 0,
  PETSCSF_LEAF2ROOT
} PetscSFDirection;
typedef enum {
  PETSCSF_BCAST = 0,
  PETSCSF_REDUCE,
  PETSCSF_FETCH
} PetscSFOperation;
/* When doing device-aware MPI, a backend refers to the SF/device interface */
typedef enum {
  PETSCSF_BACKEND_INVALID = 0,
  PETSCSF_BACKEND_CUDA,
  PETSCSF_BACKEND_HIP,
  PETSCSF_BACKEND_KOKKOS
} PetscSFBackend;

typedef struct _n_PetscSFLink *PetscSFLink;

struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);
  PetscErrorCode (*Destroy)(PetscSF);
  PetscErrorCode (*SetUp)(PetscSF);
  PetscErrorCode (*SetFromOptions)(PetscSF, PetscOptionItems *);
  PetscErrorCode (*View)(PetscSF, PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF, PetscSFDuplicateOption, PetscSF);
  PetscErrorCode (*BcastBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*BcastEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*ReduceBegin)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *, MPI_Op);
  PetscErrorCode (*ReduceEnd)(PetscSF, MPI_Datatype, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF, MPI_Datatype, PetscMemType, void *, PetscMemType, const void *, void *, MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)(PetscSF, MPI_Datatype, void *, const void *, void *, MPI_Op);
  PetscErrorCode (*BcastToZero)(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, void *); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*GetLeafRanks)(PetscSF, PetscMPIInt *, const PetscMPIInt **, const PetscInt **, const PetscInt **);
  PetscErrorCode (*CreateLocalSF)(PetscSF, PetscSF *);
  PetscErrorCode (*GetGraph)(PetscSF, PetscInt *, PetscInt *, const PetscInt **, const PetscSFNode **);
  PetscErrorCode (*CreateEmbeddedRootSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF, PetscInt, const PetscInt *, PetscSF *);
  PetscErrorCode (*SetCommunicationOps)(PetscSF, PetscSFLink);

  PetscErrorCode (*Malloc)(PetscMemType, size_t, void **);
  PetscErrorCode (*Free)(PetscMemType, void *);
};
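/* A minimal sketch (not taken from this file; the _MyImpl names are hypothetical) of how an SF
   implementation would typically fill a subset of this ops table inside its Create routine,
   following the usual PETSc pattern of one function pointer per overridable operation:

     static PetscErrorCode PetscSFCreate_MyImpl(PetscSF sf)
     {
       PetscFunctionBegin;
       sf->ops->SetUp       = PetscSFSetUp_MyImpl;
       sf->ops->Reset       = PetscSFReset_MyImpl;
       sf->ops->Destroy     = PetscSFDestroy_MyImpl;
       sf->ops->BcastBegin  = PetscSFBcastBegin_MyImpl;
       sf->ops->BcastEnd    = PetscSFBcastEnd_MyImpl;
       sf->ops->ReduceBegin = PetscSFReduceBegin_MyImpl;
       sf->ops->ReduceEnd   = PetscSFReduceEnd_MyImpl;
       PetscFunctionReturn(PETSC_SUCCESS);
     }
*/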
typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;

struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  struct { /* Fields needed to implement VecScatter behavior */
    PetscInt from_n, to_n; /* Recorded local sizes of the input from/to vectors in VecScatterCreate(). Used subsequently for error checking. */
    PetscBool beginandendtogether; /* Indicates that the scatter begin and end functions are called together; VecScatterEnd() is then treated as a no-op */
    const PetscScalar *xdata; /* Vector data to read from */
    PetscScalar *ydata; /* Vector data to write to. The two pointers are recorded in VecScatterBegin(). Memory is not managed by SF. */
    PetscSF lsf; /* The local part of the scatter, used in SCATTER_LOCAL. Built on demand. */
    PetscInt bs; /* Block size, determined by the IS passed to VecScatterCreate() */
    MPI_Datatype unit; /* One unit = bs PetscScalars */
    PetscBool logging; /* Indicates whether vscat log events are active; if so, skip duplicate SF logging to keep -log_view clear */
  } vscat;
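
  /* A sketch (an assumption about how bs and unit are meant to relate, not code from this file):
     the unit datatype represents bs contiguous PetscScalars, so it could be built as

       MPI_Datatype unit;
       PetscCallMPI(MPI_Type_contiguous(bs, MPIU_SCALAR, &unit));
       PetscCallMPI(MPI_Type_commit(&unit));

     and then passed as the MPI_Datatype argument of the SF communication routines. */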

  /* Fields for generic PetscSF functionality */
  PetscInt nroots; /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt nleaves; /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt *mine; /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt *mine_alloc;
  PetscInt minleaf, maxleaf;
  PetscSFNode *remote; /* Remote references to roots for each local leaf */
  PetscSFNode *remote_alloc;
  PetscMPIInt nranks; /* Number of ranks owning roots connected to my leaves */
  PetscMPIInt ndranks; /* Number of ranks in distinguished group holding roots connected to my leaves */
  PetscMPIInt *ranks; /* List of ranks referenced by "remote" */
  PetscInt *roffset; /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt *rmine; /* Concatenated array holding local indices referencing each remote rank */
  PetscInt *rmine_d[2]; /* A copy of rmine[local/remote] in device memory if needed */
  PetscBool monitor; /* Monitor the SF communication */

  /* Some results useful in packing, obtained by analyzing rmine[] */
  PetscInt leafbuflen[2]; /* Length (in units) of leaf buffers, in the layout [PETSCSF_LOCAL/REMOTE] */
  PetscBool leafcontig[2]; /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt leafstart[2]; /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt leafpackopt[2]; /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt leafpackopt_d[2]; /* Copy of leafpackopt[] on device if needed */
  PetscBool leafdups[2]; /* Do indices in rmine[] for self(0)/remote(1) communication have duplicates? TRUE implies threads working on them in parallel may have a data race. */

  PetscMPIInt nleafreqs; /* Number of MPI requests for leaves */
  PetscInt *rremote; /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool degreeknown; /* The degree is currently known; no need to recompute */
  PetscInt *degree; /* Degree of each of my root vertices */
  PetscInt *degreetmp; /* Temporary local array for computing degree */
  PetscBool rankorder; /* Sort ranks for gather and scatter operations */
  MPI_Group ingroup; /* Group of processes connected to my roots */
  MPI_Group outgroup; /* Group of processes connected to my leaves */
  PetscSF multi; /* Internal graph used to implement gather and scatter operations */
  PetscSF rankssf; /* Internal graph used to implement communications with root ranks */
  PetscBool graphset; /* Flag indicating that the graph has been set; required before calling communication routines */
  PetscBool setupcalled; /* Type and communication structures have been set up */
  PetscSFPattern pattern; /* Pattern of the graph */
  PetscBool persistent; /* Does this SF use MPI persistent requests for communication? */
  PetscBool collective; /* Is this SF collective? Currently only SFBASIC/SFWINDOW are not collective */
  PetscLayout map; /* Layout of leaves over all processes when building a patterned graph */
  PetscBool unknown_input_stream; /* If true, SF does not know which streams root/leafdata are on. Default is false, since we only use PETSc's default stream */
  PetscBool use_gpu_aware_mpi; /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool use_stream_aware_mpi; /* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
  PetscInt maxResidentThreadsPerGPU;
  PetscBool allow_multi_leaves;
  PetscSFBackend backend; /* The device backend (if any) SF will use */
  void *data; /* Pointer to the implementation */

#if defined(PETSC_HAVE_NVSHMEM)
  PetscBool use_nvshmem; /* TRY to use NVSHMEM on CUDA devices with this SF when possible */
  PetscBool use_nvshmem_get; /* If true, use an nvshmem_get-based protocol; otherwise use an nvshmem_put-based protocol */
  PetscBool checked_nvshmem_eligibility; /* Have we checked eligibility of using NVSHMEM on this SF? */
  PetscBool setup_nvshmem; /* Have we already set up the NVSHMEM-related fields below? These fields are built on demand */
  PetscInt leafbuflen_rmax; /* Max leafbuflen[REMOTE] over comm */
  PetscInt nRemoteRootRanks; /* nranks - ndranks */
  PetscInt nRemoteRootRanksMax; /* Max nranks-ndranks over comm */

  /* The following two fields look confusing but actually make sense: they are offsets of buffers at the remote side. We're doing one-sided communication! */
  PetscInt *rootsigdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its rootsigdisp[i]-th root signal */
  PetscInt *rootbufdisp; /* [nRemoteRootRanks]. For my i-th remote root rank, I will access its root buf at offset rootbufdisp[i], in <unit> to be set */

  PetscInt *rootbufdisp_d;
  PetscInt *rootsigdisp_d; /* Copy of rootsigdisp[] on device */
  PetscMPIInt *ranks_d; /* Copy of the remote part of (root) ranks[] on device */
  PetscInt *roffset_d; /* Copy of the remote part of roffset[] on device */
#endif
#if defined(PETSC_HAVE_MPIX_STREAM)
  MPIX_Stream mpi_stream;
  MPI_Comm stream_comm; /* GPU-stream-aware MPI communicator */
#endif
};
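
/* A brief usage sketch (an assumed example, not part of this header) showing how the generic
   graph fields above (nroots, nleaves, mine, remote) get populated through the public API:
   each leaf i is connected to root iremote[i].index on rank iremote[i].rank, and ilocal == NULL
   means the leaves are numbered contiguously from 0.

     PetscSF      sf;
     PetscInt     nroots = 2, nleaves = 2, *ilocal = NULL;
     PetscSFNode *iremote;

     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf));
     PetscCall(PetscMalloc1(nleaves, &iremote));
     iremote[0].rank = 0; iremote[0].index = 0; // leaf 0 -> root 0 on rank 0
     iremote[1].rank = 0; iremote[1].index = 1; // leaf 1 -> root 1 on rank 0
     PetscCall(PetscSFSetGraph(sf, nroots, nleaves, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
     PetscCall(PetscSFSetUp(sf));
*/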

PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

PETSC_INTERN PetscErrorCode PetscSFGetDatatypeSize_Internal(MPI_Comm, MPI_Datatype, MPI_Aint *);

PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF, PetscSF *);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF, MPI_Datatype, const void *, void *) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(3, 2) PETSC_ATTRIBUTE_MPI_POINTER_WITH_TYPE(4, 2);

PETSC_INTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype, MPI_Datatype *, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype, MPI_Datatype, PetscBool *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype, MPI_Datatype, PetscInt *);

PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_envelope(MPI_Datatype, MPIU_Count *, MPIU_Count *, MPIU_Count *, MPIU_Count *, PetscMPIInt *);
PETSC_INTERN PetscErrorCode MPIPetsc_Type_get_contents(MPI_Datatype, MPIU_Count, MPIU_Count, MPIU_Count, MPIU_Count, int *, MPI_Aint *, MPIU_Count *, MPI_Datatype *);
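
/* A hypothetical sketch (consistent only with the declarations above, not code from this file) of
   how the datatype helpers might be used: given some MPI_Datatype unit, the comparison presumably
   sets the flag when the two datatypes match, enabling a fast path for built-in types:

     PetscBool isscalar = PETSC_FALSE;
     PetscCall(MPIPetsc_Type_compare(unit, MPIU_SCALAR, &isscalar));
     if (isscalar) PetscCall(PetscInfo(NULL, "unit is equivalent to MPIU_SCALAR\n"));
*/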

#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
#define MPIU_Ibcast(a, b, c, d, e, req) MPI_Ibcast(a, b, c, d, e, req)
#define MPIU_Ireduce(a, b, c, d, e, f, g, req) MPI_Ireduce(a, b, c, d, e, f, g, req)
#define MPIU_Iscatter(a, b, c, d, e, f, g, h, req) MPI_Iscatter(a, b, c, d, e, f, g, h, req)
#define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Iscatterv(a, b, c, d, e, f, g, h, i, req)
#define MPIU_Igather(a, b, c, d, e, f, g, h, req) MPI_Igather(a, b, c, d, e, f, g, h, req)
#define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req) MPI_Igatherv(a, b, c, d, e, f, g, h, i, req)
#define MPIU_Iallgather(a, b, c, d, e, f, g, req) MPI_Iallgather(a, b, c, d, e, f, g, req)
#define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req) MPI_Iallgatherv(a, b, c, d, e, f, g, h, req)
#define MPIU_Ialltoall(a, b, c, d, e, f, g, req) MPI_Ialltoall(a, b, c, d, e, f, g, req)
#else
/* Ignore req, the MPI_Request argument, and fall back to the MPI blocking collectives. One should
   initialize req to MPI_REQUEST_NULL so that MPI_Wait(req, status) works whether the call is
   blocking or not (see the usage sketch after the #endif below).
*/
#define MPIU_Ibcast(a, b, c, d, e, req) MPI_Bcast(a, b, c, d, e)
#define MPIU_Ireduce(a, b, c, d, e, f, g, req) MPI_Reduce(a, b, c, d, e, f, g)
#define MPIU_Iscatter(a, b, c, d, e, f, g, h, req) MPI_Scatter(a, b, c, d, e, f, g, h)
#define MPIU_Iscatterv(a, b, c, d, e, f, g, h, i, req) MPI_Scatterv(a, b, c, d, e, f, g, h, i)
#define MPIU_Igather(a, b, c, d, e, f, g, h, req) MPI_Gather(a, b, c, d, e, f, g, h)
#define MPIU_Igatherv(a, b, c, d, e, f, g, h, i, req) MPI_Gatherv(a, b, c, d, e, f, g, h, i)
#define MPIU_Iallgather(a, b, c, d, e, f, g, req) MPI_Allgather(a, b, c, d, e, f, g)
#define MPIU_Iallgatherv(a, b, c, d, e, f, g, h, req) MPI_Allgatherv(a, b, c, d, e, f, g, h)
#define MPIU_Ialltoall(a, b, c, d, e, f, g, req) MPI_Alltoall(a, b, c, d, e, f, g)
#endif
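
/* Usage sketch (an assumed example): initialize the request to MPI_REQUEST_NULL so that the same
   wait code works whether MPIU_Ibcast maps to the nonblocking or the blocking collective (a wait
   on MPI_REQUEST_NULL returns immediately):

     PetscInt    buf[8] = {0};
     MPI_Request req    = MPI_REQUEST_NULL;
     PetscCallMPI(MPIU_Ibcast(buf, 8, MPIU_INT, 0, PETSC_COMM_WORLD, &req));
     PetscCallMPI(MPI_Wait(&req, MPI_STATUS_IGNORE));
*/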

PETSC_EXTERN PetscErrorCode VecScatterGetRemoteCount_Private(VecScatter, PetscBool, PetscInt *, PetscInt *);
PETSC_EXTERN PetscErrorCode VecScatterGetRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_EXTERN PetscErrorCode VecScatterGetRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_EXTERN PetscErrorCode VecScatterRestoreRemote_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);
PETSC_EXTERN PetscErrorCode VecScatterRestoreRemoteOrdered_Private(VecScatter, PetscBool, PetscMPIInt *, const PetscInt **, const PetscInt **, const PetscMPIInt **, PetscInt *);

#if defined(PETSC_HAVE_CUDA)
PETSC_EXTERN PetscErrorCode PetscSFMalloc_CUDA(PetscMemType, size_t, void **);
PETSC_EXTERN PetscErrorCode PetscSFFree_CUDA(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_HIP)
PETSC_EXTERN PetscErrorCode PetscSFMalloc_HIP(PetscMemType, size_t, void **);
PETSC_EXTERN PetscErrorCode PetscSFFree_HIP(PetscMemType, void *);
#endif
#if defined(PETSC_HAVE_KOKKOS)
PETSC_EXTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType, size_t, void **);
PETSC_EXTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType, void *);
#endif

/* SF only supports CUDA, HIP and Kokkos devices. Even though VIENNACL is a device type, its device
   pointers are invisible to SF: through VecGetArray(), data of VECVIENNACL is copied from device to
   host and host pointers are passed to SF.
*/
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS) || defined(PETSC_HAVE_HIP)
#define PetscSFMalloc(sf, mtype, sz, ptr) ((*(sf)->ops->Malloc)(mtype, sz, ptr))
/* Free memory and set ptr to NULL on success */
#define PetscSFFree(sf, mtype, ptr) ((PetscErrorCode)((ptr) && ((*(sf)->ops->Free)(mtype, ptr) || ((ptr) = NULL, PETSC_SUCCESS))))
#else
/* For pure host code, use less indirection */
#define PetscSFMalloc(sf, mtype, sz, ptr) PetscMalloc(sz, ptr)
#define PetscSFFree(sf, mtype, ptr) PetscFree(ptr)
#endif
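
/* A usage sketch (an assumed example; sf and n are taken to already exist): allocate a buffer in
   the memory space indicated by mtype and free it again; note that PetscSFFree() receives the
   pointer itself (not its address) and resets it to NULL on success in both branches above:

     PetscScalar *buf = NULL;
     PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_HOST, n * sizeof(PetscScalar), (void **)&buf));
     buf[0] = 1.0;                                        // use the buffer
     PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_HOST, buf)); // buf is NULL afterwards
*/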