Actual source code: veccupmimpl.h
1: #pragma once
3: #include <petsc/private/vecimpl.h>
4: #include <../src/vec/vec/impls/dvecimpl.h>
6: #if PetscDefined(HAVE_NVSHMEM)
7: PETSC_INTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
8: PETSC_INTERN PetscErrorCode PetscNvshmemMalloc(size_t, void **);
9: PETSC_INTERN PetscErrorCode PetscNvshmemCalloc(size_t, void **);
10: PETSC_INTERN PetscErrorCode PetscNvshmemFree_Private(void *);
11: #define PetscNvshmemFree(ptr) ((PetscErrorCode)((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr) = PETSC_NULLPTR, PETSC_SUCCESS))))
12: PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt, PetscScalar *, const PetscScalar *);
13: PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt, PetscReal *, const PetscReal *);
14: PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
15: #else
16: #define PetscNvshmemFree(ptr) PETSC_SUCCESS
17: #endif
19: #if defined(__cplusplus) && PetscDefined(HAVE_DEVICE)
20: #include <petsc/private/deviceimpl.h>
21: #include <petsc/private/cupmobject.hpp>
22: #include <petsc/private/cupmblasinterface.hpp>
24: #include <petsc/private/cpp/functional.hpp>
26: #include <limits> // std::numeric_limits
28: namespace Petsc
29: {
31: namespace vec
32: {
34: namespace cupm
35: {
37: namespace impl
38: {
40: namespace
41: {
43: struct no_op {
44: template <typename... T>
45: constexpr PetscErrorCode operator()(T &&...) const noexcept
46: {
47: return PETSC_SUCCESS;
48: }
49: };
51: template <typename T>
52: struct CooPair {
53: using value_type = T;
54: using size_type = PetscCount;
56: value_type *&device;
57: value_type *&host;
58: size_type size;
59: };
61: template <typename U>
62: static constexpr CooPair<U> make_coo_pair(U *&device, U *&host, PetscCount size) noexcept
63: {
64: return {device, host, size};
65: }
67: } // anonymous namespace
69: // forward declarations
70: template <device::cupm::DeviceType>
71: class VecSeq_CUPM;
72: template <device::cupm::DeviceType>
73: class VecMPI_CUPM;
75: // ==========================================================================================
76: // Vec_CUPMBase
77: //
78: // Base class for the VecSeq and VecMPI CUPM implementations. On top of the usual DeviceType
79: // template parameter it also uses CRTP to be able to use values/calls specific to either
80: // VecSeq or VecMPI. This is in effect "inside-out" polymorphism.
81: // ==========================================================================================
82: template <device::cupm::DeviceType T, typename Derived>
83: class Vec_CUPMBase : protected device::cupm::impl::CUPMObject<T> {
84: public:
85: PETSC_CUPMOBJECT_HEADER(T);
87: // ==========================================================================================
88: // Vec_CUPMBase::VectorArray
89: //
90: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
91: // holds the pointer itself provides the implicit conversion operator
92: // ==========================================================================================
93: template <PetscMemType, PetscMemoryAccessMode>
94: class VectorArray;
96: protected:
97: static PetscErrorCode VecView_Debug(Vec v, const char *message = "") noexcept
98: {
99: const auto pobj = PetscObjectCast(v);
100: const auto vimpl = VecIMPLCast(v);
101: const auto vcu = VecCUPMCast(v);
102: PetscMemType mtype;
103: MPI_Comm comm;
105: PetscFunctionBegin;
106: PetscAssertPointer(vimpl, 1);
107: PetscAssertPointer(vcu, 1);
108: PetscCall(PetscObjectGetComm(pobj, &comm));
109: PetscCall(PetscPrintf(comm, "---------- %s ----------\n", message));
110: PetscCall(PetscObjectPrintClassNamePrefixType(pobj, PETSC_VIEWER_STDOUT_(comm)));
111: PetscCall(PetscPrintf(comm, "Address: %p\n", v));
112: PetscCall(PetscPrintf(comm, "Size: %" PetscInt_FMT "\n", v->map->n));
113: PetscCall(PetscPrintf(comm, "Offload mask: %s\n", PetscOffloadMaskToString(v->offloadmask)));
114: PetscCall(PetscPrintf(comm, "Host ptr: %p\n", vimpl->array));
115: PetscCall(PetscPrintf(comm, "Device ptr: %p\n", vcu->array_d));
116: PetscCall(PetscPrintf(comm, "Device alloced ptr: %p\n", vcu->array_allocated_d));
117: PetscCall(PetscCUPMGetMemType(vcu->array_d, &mtype));
118: PetscCall(PetscPrintf(comm, "dptr is device mem? %s\n", PetscBools[static_cast<PetscBool>(PetscMemTypeDevice(mtype))]));
119: PetscFunctionReturn(PETSC_SUCCESS);
120: }
122: // Delete the allocated device array if required and replace it with the given array
123: static PetscErrorCode ResetAllocatedDevicePtr_(PetscDeviceContext, Vec, PetscScalar * = nullptr) noexcept;
124: // Check either the host or device impl pointer is allocated and allocate it if
125: // isn't. CastFunctionType casts the Vec to the required type and returns the pointer
126: template <typename CastFunctionType>
127: static PetscErrorCode VecAllocateCheck_(Vec, void *&, CastFunctionType &&) noexcept;
128: // Check the CUPM part (v->spptr) is allocated, otherwise allocate it
129: static PetscErrorCode VecCUPMAllocateCheck_(Vec) noexcept;
130: // Check the Host part (v->data) is allocated, otherwise allocate it
131: static PetscErrorCode VecIMPLAllocateCheck_(Vec) noexcept;
132: // Check the Host array is allocated, otherwise allocate it
133: static PetscErrorCode HostAllocateCheck_(PetscDeviceContext, Vec) noexcept;
134: // Check the CUPM array is allocated, otherwise allocate it
135: static PetscErrorCode DeviceAllocateCheck_(PetscDeviceContext, Vec) noexcept;
136: // Copy HTOD, allocating device if necessary
137: static PetscErrorCode CopyToDevice_(PetscDeviceContext, Vec, bool = false) noexcept;
138: // Copy DTOH, allocating host if necessary
139: static PetscErrorCode CopyToHost_(PetscDeviceContext, Vec, bool = false) noexcept;
140: static PetscErrorCode DestroyDevice_(Vec) noexcept;
141: static PetscErrorCode DestroyHost_(Vec) noexcept;
143: public:
144: struct Vec_CUPM {
145: PetscScalar *array_d; // gpu data
146: PetscScalar *array_allocated_d; // does PETSc own the array ptr?
147: PetscBool nvshmem; // is array allocated in nvshmem? It is used to allocate
148: // Mvctx->lvec in nvshmem
150: // COO stuff
151: PetscCount *jmap1_d; // [m+1]: i-th entry of the vector has jmap1[i+1]-jmap1[i] repeats
152: // in COO arrays
153: PetscCount *perm1_d; // [tot1]: permutation array for local entries
154: PetscCount *imap2_d; // [nnz2]: i-th unique entry in recvbuf is imap2[i]-th entry in
155: // the vector
156: PetscCount *jmap2_d; // [nnz2+1]
157: PetscCount *perm2_d; // [recvlen]
158: PetscCount *Cperm_d; // [sendlen]: permutation array to fill sendbuf[]. 'C' for
159: // communication
161: // Buffers for remote values in VecSetValuesCOO()
162: PetscScalar *sendbuf_d;
163: PetscScalar *recvbuf_d;
164: };
166: // Cast the Vec to its Vec_CUPM struct, i.e. return the result of (Vec_CUPM *)v->spptr
167: PETSC_NODISCARD static Vec_CUPM *VecCUPMCast(Vec) noexcept;
168: // Cast the Vec to its host struct, i.e. return the result of (Vec_Seq *)v->data
169: template <typename U = Derived>
170: PETSC_NODISCARD static constexpr auto VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v));
171: // Get the PetscLogEvents for HTOD and DTOH
172: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyToGPU() noexcept;
173: PETSC_NODISCARD static constexpr PetscLogEvent VEC_CUPMCopyFromGPU() noexcept;
174: // Get the VecTypes
175: PETSC_NODISCARD static constexpr VecType VECSEQCUPM() noexcept;
176: PETSC_NODISCARD static constexpr VecType VECMPICUPM() noexcept;
177: PETSC_NODISCARD static constexpr VecType VECCUPM() noexcept;
179: // Get the device VecType of the calling vector
180: template <typename U = Derived>
181: PETSC_NODISCARD static constexpr VecType VECIMPLCUPM() noexcept;
182: // Get the host VecType of the calling vector
183: template <typename U = Derived>
184: PETSC_NODISCARD static constexpr VecType VECIMPL() noexcept;
186: // Call the host destroy function, i.e. VecDestroy_Seq()
187: static PetscErrorCode VecDestroy_IMPL(Vec) noexcept;
188: // Call the host reset function, i.e. VecResetArray_Seq()
189: static PetscErrorCode VecResetArray_IMPL(Vec) noexcept;
190: // ... you get the idea
191: static PetscErrorCode VecPlaceArray_IMPL(Vec, const PetscScalar *) noexcept;
192: // Call the host creation function, i.e. VecCreate_Seq(), and also initialize the CUPM part
193: // along with it if needed
194: static PetscErrorCode VecCreate_IMPL_Private(Vec, PetscBool *, PetscInt = 0, PetscScalar * = nullptr) noexcept;
196: // Shorthand for creating VectorArray's. Need functions to create them, otherwise using them
197: // as an unnamed temporary leads to most vexing parse
198: PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, v});
199: PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
200: PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
201: PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, v});
202: PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, v});
203: PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Vec v) noexcept PETSC_DECLTYPE_AUTO_RETURNS(VectorArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, v});
205: // ops-table functions
206: static PetscErrorCode Create(Vec) noexcept;
207: static PetscErrorCode Destroy(Vec) noexcept;
208: template <PetscMemType, PetscMemoryAccessMode, bool = false>
209: static PetscErrorCode GetArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
210: template <PetscMemType, PetscMemoryAccessMode, bool = false>
211: static PetscErrorCode GetArray(Vec, PetscScalar **) noexcept;
212: template <PetscMemType, PetscMemoryAccessMode>
213: static PetscErrorCode RestoreArray(Vec, PetscScalar **, PetscDeviceContext) noexcept;
214: template <PetscMemType, PetscMemoryAccessMode>
215: static PetscErrorCode RestoreArray(Vec, PetscScalar **) noexcept;
216: template <PetscMemoryAccessMode>
217: static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
218: template <PetscMemoryAccessMode>
219: static PetscErrorCode GetArrayAndMemtype(Vec, PetscScalar **, PetscMemType *) noexcept;
220: template <PetscMemoryAccessMode>
221: static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **, PetscDeviceContext) noexcept;
222: template <PetscMemoryAccessMode>
223: static PetscErrorCode RestoreArrayAndMemtype(Vec, PetscScalar **) noexcept;
224: template <PetscMemType>
225: static PetscErrorCode ReplaceArray(Vec, const PetscScalar *) noexcept;
226: template <PetscMemType>
227: static PetscErrorCode ResetArray(Vec) noexcept;
228: template <PetscMemType>
229: static PetscErrorCode PlaceArray(Vec, const PetscScalar *) noexcept;
231: // common ops shared between Seq and MPI
232: static PetscErrorCode Create_CUPM(Vec) noexcept;
233: static PetscErrorCode Create_CUPMBase(MPI_Comm, PetscInt, PetscInt, PetscInt, Vec *, PetscBool, PetscLayout /*reference*/ = nullptr) noexcept;
234: static PetscErrorCode Initialize_CUPMBase(Vec, PetscBool, PetscScalar *, PetscScalar *, PetscDeviceContext) noexcept;
235: template <typename SetupFunctionT = no_op>
236: static PetscErrorCode Duplicate_CUPMBase(Vec, Vec *, PetscDeviceContext, SetupFunctionT && = SetupFunctionT{}) noexcept;
237: static PetscErrorCode BindToCPU_CUPMBase(Vec, PetscBool, PetscDeviceContext) noexcept;
238: static PetscErrorCode GetArrays_CUPMBase(Vec, const PetscScalar **, const PetscScalar **, PetscOffloadMask *, PetscDeviceContext) noexcept;
239: static PetscErrorCode ResetPreallocationCOO_CUPMBase(Vec, PetscDeviceContext) noexcept;
240: template <std::size_t NCount = 0, std::size_t NScal = 0>
241: static PetscErrorCode SetPreallocationCOO_CUPMBase(Vec, PetscCount, const PetscInt[], PetscDeviceContext, const std::array<CooPair<PetscCount>, NCount> & = {}, const std::array<CooPair<PetscScalar>, NScal> & = {}) noexcept;
243: static PetscErrorCode Convert_IMPL_IMPLCUPM(Vec) noexcept;
244: };
246: // ==========================================================================================
247: // Vec_CUPMBase::VectorArray
248: //
249: // RAII versions of the get/restore array routines. Determines constness of the pointer type,
250: // holds the pointer itself and provides the implicit conversion operator.
251: //
252: // On construction this calls the moral equivalent of Vec[CUPM]GetArray[Read|Write]()
253: // (depending on PetscMemoryAccessMode) and on destruction automatically restores the array
254: // for you
255: // ==========================================================================================
256: template <device::cupm::DeviceType T, typename D>
257: template <PetscMemType MT, PetscMemoryAccessMode MA>
258: class Vec_CUPMBase<T, D>::VectorArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
259: using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;
261: public:
262: VectorArray(PetscDeviceContext, Vec) noexcept;
263: ~VectorArray() noexcept;
265: private:
266: Vec v_ = nullptr;
267: };
269: // ==========================================================================================
270: // Vec_CUPMBase::VectorArray - Public API
271: // ==========================================================================================
273: template <device::cupm::DeviceType T, typename D>
274: template <PetscMemType MT, PetscMemoryAccessMode MA>
275: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::VectorArray(PetscDeviceContext dctx, Vec v) noexcept : base_type{dctx}, v_{v}
276: {
277: PetscFunctionBegin;
278: PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template GetArray<MT, MA, true>(v, &this->ptr_, dctx));
279: PetscFunctionReturnVoid();
280: }
282: template <device::cupm::DeviceType T, typename D>
283: template <PetscMemType MT, PetscMemoryAccessMode MA>
284: inline Vec_CUPMBase<T, D>::VectorArray<MT, MA>::~VectorArray() noexcept
285: {
286: PetscFunctionBegin;
287: PetscCallAbort(PETSC_COMM_SELF, Vec_CUPMBase<T, D>::template RestoreArray<MT, MA>(v_, &this->ptr_, this->dctx_));
288: PetscFunctionReturnVoid();
289: }
291: // ==========================================================================================
292: // Vec_CUPMBase - Protected API
293: // ==========================================================================================
295: template <device::cupm::DeviceType T, typename D>
296: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetAllocatedDevicePtr_(PetscDeviceContext dctx, Vec v, PetscScalar *new_value) noexcept
297: {
298: auto &device_array = VecCUPMCast(v)->array_allocated_d;
300: PetscFunctionBegin;
301: if (device_array) {
302: if (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) {
303: PetscCall(PetscNvshmemFree(device_array));
304: } else {
305: cupmStream_t stream;
307: PetscCall(GetHandlesFrom_(dctx, &stream));
308: PetscCallCUPM(cupmFreeAsync(device_array, stream));
309: }
310: }
311: device_array = new_value;
312: PetscFunctionReturn(PETSC_SUCCESS);
313: }
315: namespace
316: {
318: inline PetscErrorCode VecCUPMCheckMinimumPinnedMemory_Internal(Vec v, PetscBool *set = nullptr) noexcept
319: {
320: auto mem = static_cast<PetscInt>(v->minimum_bytes_pinned_memory);
321: PetscBool flg;
323: PetscFunctionBegin;
324: PetscObjectOptionsBegin(PetscObjectCast(v));
325: PetscCall(PetscOptionsRangeInt("-vec_pinned_memory_min", "Minimum size (in bytes) for an allocation to use pinned memory on host", "VecSetPinnedMemoryMin", mem, &mem, &flg, 0, std::numeric_limits<decltype(mem)>::max()));
326: if (flg) v->minimum_bytes_pinned_memory = mem;
327: PetscOptionsEnd();
328: if (set) *set = flg;
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
332: } // anonymous namespace
334: template <device::cupm::DeviceType T, typename D>
335: template <typename CastFunctionType>
336: inline PetscErrorCode Vec_CUPMBase<T, D>::VecAllocateCheck_(Vec v, void *&dest, CastFunctionType &&cast) noexcept
337: {
338: PetscFunctionBegin;
339: if (PetscLikely(dest)) PetscFunctionReturn(PETSC_SUCCESS);
340: // do the check here so we don't have to do it in every function
341: PetscCall(checkCupmBlasIntCast(v->map->n));
342: {
343: auto impl = cast(v);
345: PetscCall(PetscNew(&impl));
346: dest = impl;
347: }
348: PetscFunctionReturn(PETSC_SUCCESS);
349: }
351: template <device::cupm::DeviceType T, typename D>
352: inline PetscErrorCode Vec_CUPMBase<T, D>::VecIMPLAllocateCheck_(Vec v) noexcept
353: {
354: PetscFunctionBegin;
355: PetscCall(VecAllocateCheck_(v, v->data, VecIMPLCast<D>));
356: PetscFunctionReturn(PETSC_SUCCESS);
357: }
359: // allocate the Vec_CUPM struct. this is normally done through DeviceAllocateCheck_(), but in
360: // certain circumstances (such as when the user places the device array) we do not want to do
361: // the full DeviceAllocateCheck_() as it also allocates the array
362: template <device::cupm::DeviceType T, typename D>
363: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCUPMAllocateCheck_(Vec v) noexcept
364: {
365: PetscFunctionBegin;
366: PetscCall(VecAllocateCheck_(v, v->spptr, VecCUPMCast));
367: PetscFunctionReturn(PETSC_SUCCESS);
368: }
370: template <device::cupm::DeviceType T, typename D>
371: inline PetscErrorCode Vec_CUPMBase<T, D>::HostAllocateCheck_(PetscDeviceContext, Vec v) noexcept
372: {
373: PetscFunctionBegin;
374: PetscCall(VecIMPLAllocateCheck_(v));
375: if (auto &alloc = VecIMPLCast(v)->array_allocated) PetscFunctionReturn(PETSC_SUCCESS);
376: else {
377: PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v));
378: {
379: const auto n = v->map->n;
380: const auto useit = UseCUPMHostAlloc((n * sizeof(*alloc)) > v->minimum_bytes_pinned_memory);
382: v->pinned_memory = static_cast<decltype(v->pinned_memory)>(useit.value());
383: PetscCall(PetscMalloc1(n, &alloc));
384: }
385: if (!VecIMPLCast(v)->array) VecIMPLCast(v)->array = alloc;
386: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) v->offloadmask = PETSC_OFFLOAD_CPU;
387: }
388: PetscFunctionReturn(PETSC_SUCCESS);
389: }
391: template <device::cupm::DeviceType T, typename D>
392: inline PetscErrorCode Vec_CUPMBase<T, D>::DeviceAllocateCheck_(PetscDeviceContext dctx, Vec v) noexcept
393: {
394: PetscFunctionBegin;
395: PetscCall(VecCUPMAllocateCheck_(v));
396: if (auto &alloc = VecCUPMCast(v)->array_d) PetscFunctionReturn(PETSC_SUCCESS);
397: else {
398: const auto n = v->map->n;
399: auto &array_allocated_d = VecCUPMCast(v)->array_allocated_d;
400: cupmStream_t stream;
402: PetscCall(GetHandlesFrom_(dctx, &stream));
403: PetscCall(PetscCUPMMallocAsync(&array_allocated_d, n, stream));
404: alloc = array_allocated_d;
405: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
406: const auto vimp = VecIMPLCast(v);
407: v->offloadmask = (vimp && vimp->array) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
408: }
409: }
410: PetscFunctionReturn(PETSC_SUCCESS);
411: }
413: template <device::cupm::DeviceType T, typename D>
414: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToDevice_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
415: {
416: PetscFunctionBegin;
417: PetscCall(DeviceAllocateCheck_(dctx, v));
418: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
419: cupmStream_t stream;
421: v->offloadmask = PETSC_OFFLOAD_BOTH;
422: PetscCall(GetHandlesFrom_(dctx, &stream));
423: PetscCall(PetscLogEventBegin(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
424: PetscCall(PetscCUPMMemcpyAsync(VecCUPMCast(v)->array_d, VecIMPLCast(v)->array, v->map->n, cupmMemcpyHostToDevice, stream, forceasync));
425: PetscCall(PetscLogEventEnd(VEC_CUPMCopyToGPU(), v, 0, 0, 0));
426: }
427: PetscFunctionReturn(PETSC_SUCCESS);
428: }
430: template <device::cupm::DeviceType T, typename D>
431: inline PetscErrorCode Vec_CUPMBase<T, D>::CopyToHost_(PetscDeviceContext dctx, Vec v, bool forceasync) noexcept
432: {
433: PetscFunctionBegin;
434: PetscCall(HostAllocateCheck_(dctx, v));
435: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
436: cupmStream_t stream;
438: v->offloadmask = PETSC_OFFLOAD_BOTH;
439: PetscCall(GetHandlesFrom_(dctx, &stream));
440: PetscCall(PetscLogEventBegin(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
441: PetscCall(PetscCUPMMemcpyAsync(VecIMPLCast(v)->array, VecCUPMCast(v)->array_d, v->map->n, cupmMemcpyDeviceToHost, stream, forceasync));
442: PetscCall(PetscLogEventEnd(VEC_CUPMCopyFromGPU(), v, 0, 0, 0));
443: }
444: PetscFunctionReturn(PETSC_SUCCESS);
445: }
447: template <device::cupm::DeviceType T, typename D>
448: inline PetscErrorCode Vec_CUPMBase<T, D>::DestroyDevice_(Vec v) noexcept
449: {
450: PetscFunctionBegin;
451: if (const auto vcu = VecCUPMCast(v)) {
452: PetscDeviceContext dctx;
454: PetscCall(GetHandles_(&dctx));
455: PetscCall(ResetAllocatedDevicePtr_(dctx, v));
456: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
457: PetscCall(PetscFree(v->spptr));
458: }
459: PetscFunctionReturn(PETSC_SUCCESS);
460: }
462: template <device::cupm::DeviceType T, typename D>
463: inline PetscErrorCode Vec_CUPMBase<T, D>::DestroyHost_(Vec v) noexcept
464: {
465: PetscFunctionBegin;
466: PetscCall(PetscObjectSAWsViewOff(PetscObjectCast(v)));
467: if (const auto vimpl = VecIMPLCast(v)) {
468: if (auto &array_allocated = vimpl->array_allocated) {
469: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
471: // do this ourselves since we may want to use the cupm functions
472: PetscCall(PetscFree(array_allocated));
473: }
474: }
475: v->pinned_memory = PETSC_FALSE;
476: PetscCall(VecDestroy_IMPL(v));
477: PetscFunctionReturn(PETSC_SUCCESS);
478: }
480: // ==========================================================================================
481: // Vec_CUPMBase - Public API
482: // ==========================================================================================
484: template <device::cupm::DeviceType T, typename D>
485: inline typename Vec_CUPMBase<T, D>::Vec_CUPM *Vec_CUPMBase<T, D>::VecCUPMCast(Vec v) noexcept
486: {
487: return static_cast<Vec_CUPM *>(v->spptr);
488: }
490: // This is a trick to get around the fact that in CRTP the derived class is not yet fully
491: // defined because Base must necessarily be instantiated before Derived is
492: // complete. By using a dummy template parameter we make the type "dependent" and so will
493: // only be determined when the derived class is instantiated (and therefore fully defined)
494: template <device::cupm::DeviceType T, typename D>
495: template <typename U>
496: inline constexpr auto Vec_CUPMBase<T, D>::VecIMPLCast(Vec v) noexcept -> decltype(U::VecIMPLCast_(v))
497: {
498: return U::VecIMPLCast_(v);
499: }
501: template <device::cupm::DeviceType T, typename D>
502: inline PetscErrorCode Vec_CUPMBase<T, D>::VecDestroy_IMPL(Vec v) noexcept
503: {
504: return D::VecDestroy_IMPL_(v);
505: }
507: template <device::cupm::DeviceType T, typename D>
508: inline PetscErrorCode Vec_CUPMBase<T, D>::VecResetArray_IMPL(Vec v) noexcept
509: {
510: return D::VecResetArray_IMPL_(v);
511: }
513: template <device::cupm::DeviceType T, typename D>
514: inline PetscErrorCode Vec_CUPMBase<T, D>::VecPlaceArray_IMPL(Vec v, const PetscScalar *a) noexcept
515: {
516: return D::VecPlaceArray_IMPL_(v, a);
517: }
519: template <device::cupm::DeviceType T, typename D>
520: inline PetscErrorCode Vec_CUPMBase<T, D>::VecCreate_IMPL_Private(Vec v, PetscBool *alloc_missing, PetscInt nghost, PetscScalar *host_array) noexcept
521: {
522: return D::VecCreate_IMPL_Private_(v, alloc_missing, nghost, host_array);
523: }
525: template <device::cupm::DeviceType T, typename D>
526: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyToGPU() noexcept
527: {
528: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyToGPU : VEC_HIPCopyToGPU;
529: }
531: template <device::cupm::DeviceType T, typename D>
532: inline constexpr PetscLogEvent Vec_CUPMBase<T, D>::VEC_CUPMCopyFromGPU() noexcept
533: {
534: return T == device::cupm::DeviceType::CUDA ? VEC_CUDACopyFromGPU : VEC_HIPCopyFromGPU;
535: }
537: template <device::cupm::DeviceType T, typename D>
538: inline constexpr VecType Vec_CUPMBase<T, D>::VECSEQCUPM() noexcept
539: {
540: return T == device::cupm::DeviceType::CUDA ? VECSEQCUDA : VECSEQHIP;
541: }
543: template <device::cupm::DeviceType T, typename D>
544: inline constexpr VecType Vec_CUPMBase<T, D>::VECMPICUPM() noexcept
545: {
546: return T == device::cupm::DeviceType::CUDA ? VECMPICUDA : VECMPIHIP;
547: }
549: template <device::cupm::DeviceType T, typename D>
550: inline constexpr VecType Vec_CUPMBase<T, D>::VECCUPM() noexcept
551: {
552: return T == device::cupm::DeviceType::CUDA ? VECCUDA : VECHIP;
553: }
555: template <device::cupm::DeviceType T, typename D>
556: template <typename U>
557: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPLCUPM() noexcept
558: {
559: return U::VECIMPLCUPM_();
560: }
562: template <device::cupm::DeviceType T, typename D>
563: template <typename U>
564: inline constexpr VecType Vec_CUPMBase<T, D>::VECIMPL() noexcept
565: {
566: return U::VECIMPL_();
567: }
569: // private version that takes a PetscDeviceContext, called by the public variant
570: template <device::cupm::DeviceType T, typename D>
571: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
572: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
573: {
574: constexpr auto hostmem = PetscMemTypeHost(mtype);
575: const auto oldmask = v->offloadmask;
576: auto &mask = v->offloadmask;
577: auto should_sync = false;
579: PetscFunctionBegin;
580: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
581: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
582: if (PetscMemoryAccessRead(access)) {
583: // READ or READ_WRITE
584: if (((oldmask == PETSC_OFFLOAD_GPU) && hostmem) || ((oldmask == PETSC_OFFLOAD_CPU) && !hostmem)) {
585: // if we move the data we should set the flag to synchronize later on
586: should_sync = true;
587: }
588: PetscCall((hostmem ? CopyToHost_ : CopyToDevice_)(dctx, v, force));
589: } else {
590: // WRITE only
591: PetscCall((hostmem ? HostAllocateCheck_ : DeviceAllocateCheck_)(dctx, v));
592: }
593: *a = hostmem ? VecIMPLCast(v)->array : VecCUPMCast(v)->array_d;
594: // if unallocated previously we should zero things out if we intend to read
595: if (PetscMemoryAccessRead(access) && (oldmask == PETSC_OFFLOAD_UNALLOCATED)) {
596: const auto n = v->map->n;
598: if (hostmem) {
599: PetscCall(PetscArrayzero(*a, n));
600: } else {
601: cupmStream_t stream;
603: PetscCall(GetHandlesFrom_(dctx, &stream));
604: PetscCall(PetscCUPMMemsetAsync(*a, 0, n, stream, force));
605: should_sync = true;
606: }
607: }
608: // update the offloadmask if we intend to write, since we assume immediately modified
609: if (PetscMemoryAccessWrite(access)) {
610: PetscCall(VecSetErrorIfLocked(v, 1));
611: // REVIEW ME: this should probably also call PetscObjectStateIncrease() since we assume it
612: // is immediately modified
613: mask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
614: }
615: // if we are a globally blocking stream and we have MOVED data then we should synchronize,
616: // since even doing async calls on the NULL stream is not synchronous
617: if (!force && should_sync) PetscCall(PetscDeviceContextSynchronize(dctx));
618: PetscFunctionReturn(PETSC_SUCCESS);
619: }
621: // v->ops->getarray[read|write] or VecCUPMGetArray[Read|Write]()
622: template <device::cupm::DeviceType T, typename D>
623: template <PetscMemType mtype, PetscMemoryAccessMode access, bool force>
624: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArray(Vec v, PetscScalar **a) noexcept
625: {
626: PetscDeviceContext dctx;
628: PetscFunctionBegin;
629: PetscCall(GetHandles_(&dctx));
630: PetscCall(D::template GetArray<mtype, access, force>(v, a, dctx));
631: PetscFunctionReturn(PETSC_SUCCESS);
632: }
634: // private version that takes a PetscDeviceContext, called by the public variant
635: template <device::cupm::DeviceType T, typename D>
636: template <PetscMemType mtype, PetscMemoryAccessMode access>
637: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a, PetscDeviceContext) noexcept
638: {
639: PetscFunctionBegin;
640: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
641: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
642: if (PetscMemoryAccessWrite(access)) {
643: // WRITE or READ_WRITE
644: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
645: v->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
646: }
647: if (a) {
648: PetscCall(CheckPointerMatchesMemType_(*a, mtype));
649: *a = nullptr;
650: }
651: PetscFunctionReturn(PETSC_SUCCESS);
652: }
654: // v->ops->restorearray[read|write] or VecCUPMRestoreArray[Read|Write]()
655: template <device::cupm::DeviceType T, typename D>
656: template <PetscMemType mtype, PetscMemoryAccessMode access>
657: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArray(Vec v, PetscScalar **a) noexcept
658: {
659: PetscDeviceContext dctx;
661: PetscFunctionBegin;
662: PetscCall(GetHandles_(&dctx));
663: PetscCall(D::template RestoreArray<mtype, access>(v, a, dctx));
664: PetscFunctionReturn(PETSC_SUCCESS);
665: }
667: template <device::cupm::DeviceType T, typename D>
668: template <PetscMemoryAccessMode access>
669: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
670: {
671: PetscFunctionBegin;
672: PetscCall(D::template GetArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
673: if (mtype) *mtype = (PetscDefined(HAVE_NVSHMEM) && VecCUPMCast(v)->nvshmem) ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUPM();
674: PetscFunctionReturn(PETSC_SUCCESS);
675: }
677: // v->ops->getarrayandmemtype
678: template <device::cupm::DeviceType T, typename D>
679: template <PetscMemoryAccessMode access>
680: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrayAndMemtype(Vec v, PetscScalar **a, PetscMemType *mtype) noexcept
681: {
682: PetscDeviceContext dctx;
684: PetscFunctionBegin;
685: PetscCall(GetHandles_(&dctx));
686: PetscCall(D::template GetArrayAndMemtype<access>(v, a, mtype, dctx));
687: PetscFunctionReturn(PETSC_SUCCESS);
688: }
690: template <device::cupm::DeviceType T, typename D>
691: template <PetscMemoryAccessMode access>
692: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a, PetscDeviceContext dctx) noexcept
693: {
694: PetscFunctionBegin;
695: PetscCall(D::template RestoreArray<PETSC_MEMTYPE_DEVICE, access>(v, a, dctx));
696: PetscFunctionReturn(PETSC_SUCCESS);
697: }
699: // v->ops->restorearrayandmemtype
700: template <device::cupm::DeviceType T, typename D>
701: template <PetscMemoryAccessMode access>
702: inline PetscErrorCode Vec_CUPMBase<T, D>::RestoreArrayAndMemtype(Vec v, PetscScalar **a) noexcept
703: {
704: PetscDeviceContext dctx;
706: PetscFunctionBegin;
707: PetscCall(GetHandles_(&dctx));
708: PetscCall(D::template RestoreArrayAndMemtype<access>(v, a, dctx));
709: PetscFunctionReturn(PETSC_SUCCESS);
710: }
712: // v->ops->placearray or VecCUPMPlaceArray()
713: template <device::cupm::DeviceType T, typename D>
714: template <PetscMemType mtype>
715: inline PetscErrorCode Vec_CUPMBase<T, D>::PlaceArray(Vec v, const PetscScalar *a) noexcept
716: {
717: PetscDeviceContext dctx;
719: PetscFunctionBegin;
720: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
721: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
722: PetscCall(CheckPointerMatchesMemType_(a, mtype));
723: PetscCall(GetHandles_(&dctx));
724: if (PetscMemTypeHost(mtype)) {
725: PetscCall(CopyToHost_(dctx, v));
726: PetscCall(VecPlaceArray_IMPL(v, a));
727: v->offloadmask = PETSC_OFFLOAD_CPU;
728: } else {
729: PetscCall(VecIMPLAllocateCheck_(v));
730: {
731: auto &backup_array = VecIMPLCast(v)->unplacedarray;
733: PetscCheck(!backup_array, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "VecPlaceArray() was already called on this vector, without a call to VecResetArray()");
734: PetscCall(CopyToDevice_(dctx, v));
735: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
736: backup_array = util::exchange(VecCUPMCast(v)->array_d, const_cast<PetscScalar *>(a));
737: // only update the offload mask if we actually assign a pointer
738: if (a) v->offloadmask = PETSC_OFFLOAD_GPU;
739: }
740: }
741: PetscFunctionReturn(PETSC_SUCCESS);
742: }
744: // v->ops->replacearray or VecCUPMReplaceArray()
745: template <device::cupm::DeviceType T, typename D>
746: template <PetscMemType mtype>
747: inline PetscErrorCode Vec_CUPMBase<T, D>::ReplaceArray(Vec v, const PetscScalar *a) noexcept
748: {
749: const auto aptr = const_cast<PetscScalar *>(a);
750: PetscDeviceContext dctx;
752: PetscFunctionBegin;
753: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
754: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
755: PetscCall(CheckPointerMatchesMemType_(a, mtype));
756: PetscCall(GetHandles_(&dctx));
757: if (PetscMemTypeHost(mtype)) {
758: PetscCall(VecIMPLAllocateCheck_(v));
759: {
760: const auto vimpl = VecIMPLCast(v);
761: auto &host_array = vimpl->array_allocated;
763: // make sure the users array has the latest values.
764: // REVIEW ME: why? we're about to free it
765: if (host_array != vimpl->array) PetscCall(CopyToHost_(dctx, v));
766: if (host_array) {
767: const auto useit = UseCUPMHostAlloc(v->pinned_memory);
769: PetscCall(PetscFree(host_array));
770: }
771: host_array = aptr;
772: vimpl->array = host_array;
773: v->pinned_memory = PETSC_FALSE; // REVIEW ME: we can determine this
774: v->offloadmask = PETSC_OFFLOAD_CPU;
775: }
776: } else {
777: PetscCall(VecCUPMAllocateCheck_(v));
778: {
779: const auto vcu = VecCUPMCast(v);
781: PetscCall(ResetAllocatedDevicePtr_(dctx, v, aptr));
782: // don't update the offloadmask if placed pointer is NULL
783: vcu->array_d = vcu->array_allocated_d /* = aptr */;
784: if (aptr) v->offloadmask = PETSC_OFFLOAD_GPU;
785: }
786: }
787: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
788: PetscFunctionReturn(PETSC_SUCCESS);
789: }
791: // v->ops->resetarray or VecCUPMResetArray()
792: template <device::cupm::DeviceType T, typename D>
793: template <PetscMemType mtype>
794: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetArray(Vec v) noexcept
795: {
796: PetscDeviceContext dctx;
798: PetscFunctionBegin;
799: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
800: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
801: PetscCall(GetHandles_(&dctx));
802: // REVIEW ME:
803: // this is wildly inefficient but must be done if we assume that the placed array must have
804: // correct values
805: if (PetscMemTypeHost(mtype)) {
806: PetscCall(CopyToHost_(dctx, v));
807: PetscCall(VecResetArray_IMPL(v));
808: v->offloadmask = PETSC_OFFLOAD_CPU;
809: } else {
810: PetscCall(VecIMPLAllocateCheck_(v));
811: PetscCall(VecCUPMAllocateCheck_(v));
812: {
813: const auto vcu = VecCUPMCast(v);
814: const auto vimpl = VecIMPLCast(v);
815: auto &host_array = vimpl->unplacedarray;
817: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_DEVICE));
818: PetscCall(CopyToDevice_(dctx, v));
819: PetscCall(PetscDeviceContextSynchronize(dctx)); // Above H2D might be async, so we must sync dctx, otherwise if later user writes v's host array, it could ruin the H2D
820: PetscCall(PetscObjectStateIncrease(PetscObjectCast(v)));
821: // Need to reset the offloadmask. If we had a stashed pointer we are on the GPU,
822: // otherwise check if the host has a valid pointer. If neither, then we are not
823: // allocated.
824: vcu->array_d = host_array;
825: if (host_array) {
826: host_array = nullptr;
827: v->offloadmask = PETSC_OFFLOAD_GPU;
828: } else if (vimpl->array) {
829: v->offloadmask = PETSC_OFFLOAD_CPU;
830: } else {
831: v->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
832: }
833: }
834: }
835: PetscFunctionReturn(PETSC_SUCCESS);
836: }
838: // v->ops->create
839: template <device::cupm::DeviceType T, typename D>
840: inline PetscErrorCode Vec_CUPMBase<T, D>::Create(Vec v) noexcept
841: {
842: PetscBool alloc_missing;
843: PetscDeviceContext dctx;
845: PetscFunctionBegin;
846: PetscCall(VecCreate_IMPL_Private(v, &alloc_missing));
847: PetscCall(GetHandles_(&dctx));
848: PetscCall(Initialize_CUPMBase(v, alloc_missing, nullptr, nullptr, dctx));
849: PetscFunctionReturn(PETSC_SUCCESS);
850: }
852: // v->ops->destroy
853: template <device::cupm::DeviceType T, typename D>
854: inline PetscErrorCode Vec_CUPMBase<T, D>::Destroy(Vec v) noexcept
855: {
856: PetscFunctionBegin;
857: PetscCall(DestroyDevice_(v));
858: PetscCall(DestroyHost_(v));
859: PetscFunctionReturn(PETSC_SUCCESS);
860: }
862: // ================================================================================== //
863: // Common core between Seq and MPI //
865: // VecCreate_CUPM()
866: template <device::cupm::DeviceType T, typename D>
867: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPM(Vec v) noexcept
868: {
869: PetscMPIInt size;
871: PetscFunctionBegin;
872: PetscCallMPI(MPI_Comm_size(PetscObjectComm(PetscObjectCast(v)), &size));
873: PetscCall(VecSetType(v, size > 1 ? VECMPICUPM() : VECSEQCUPM()));
874: PetscFunctionReturn(PETSC_SUCCESS);
875: }
877: // VecCreateCUPM()
878: template <device::cupm::DeviceType T, typename D>
879: inline PetscErrorCode Vec_CUPMBase<T, D>::Create_CUPMBase(MPI_Comm comm, PetscInt bs, PetscInt n, PetscInt N, Vec *v, PetscBool call_set_type, PetscLayout reference) noexcept
880: {
881: PetscFunctionBegin;
882: PetscCall(VecCreate(comm, v));
883: if (reference) PetscCall(PetscLayoutReference(reference, &(*v)->map));
884: PetscCall(VecSetSizes(*v, n, N));
885: if (bs) PetscCall(VecSetBlockSize(*v, bs));
886: if (call_set_type) PetscCall(VecSetType(*v, VECIMPLCUPM()));
887: PetscFunctionReturn(PETSC_SUCCESS);
888: }
890: // VecCreateIMPL_CUPM(), called through v->ops->create
891: template <device::cupm::DeviceType T, typename D>
892: inline PetscErrorCode Vec_CUPMBase<T, D>::Initialize_CUPMBase(Vec v, PetscBool allocate_missing, PetscScalar *host_array, PetscScalar *device_array, PetscDeviceContext dctx) noexcept
893: {
894: PetscFunctionBegin;
895: // REVIEW ME: perhaps not needed
896: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
897: PetscCall(PetscObjectChangeTypeName(PetscObjectCast(v), VECIMPLCUPM()));
898: PetscCall(D::BindToCPU(v, PETSC_FALSE));
899: if (device_array) {
900: PetscCall(CheckPointerMatchesMemType_(device_array, PETSC_MEMTYPE_CUPM()));
901: PetscCall(VecCUPMAllocateCheck_(v));
902: VecCUPMCast(v)->array_d = device_array;
903: }
904: if (host_array) {
905: PetscCall(CheckPointerMatchesMemType_(host_array, PETSC_MEMTYPE_HOST));
906: VecIMPLCast(v)->array = host_array;
907: }
908: if (allocate_missing) {
909: PetscCall(DeviceAllocateCheck_(dctx, v));
910: PetscCall(HostAllocateCheck_(dctx, v));
911: // REVIEW ME: junchao, is this needed with new calloc() branch? VecSet() will call
912: // set() for reference
913: // calls device-version
914: PetscCall(VecSet(v, 0));
915: // zero the host while device is underway
916: PetscCall(PetscArrayzero(VecIMPLCast(v)->array, v->map->n));
917: v->offloadmask = PETSC_OFFLOAD_BOTH;
918: } else {
919: if (host_array) {
920: v->offloadmask = device_array ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
921: } else {
922: v->offloadmask = device_array ? PETSC_OFFLOAD_GPU : PETSC_OFFLOAD_UNALLOCATED;
923: }
924: }
925: PetscFunctionReturn(PETSC_SUCCESS);
926: }
928: // v->ops->duplicate
929: template <device::cupm::DeviceType T, typename D>
930: template <typename SetupFunctionT>
931: inline PetscErrorCode Vec_CUPMBase<T, D>::Duplicate_CUPMBase(Vec v, Vec *y, PetscDeviceContext dctx, SetupFunctionT &&DerivedCreateIMPLCUPM_Async) noexcept
932: {
933: // if the derived setup is the default no_op then we should call VecSetType()
934: constexpr auto call_set_type = static_cast<PetscBool>(std::is_same<SetupFunctionT, no_op>::value);
935: const auto vobj = PetscObjectCast(v);
936: const auto map = v->map;
937: PetscInt bs;
939: PetscFunctionBegin;
940: PetscCall(VecGetBlockSize(v, &bs));
941: PetscCall(Create_CUPMBase(PetscObjectComm(vobj), bs, map->n, map->N, y, call_set_type, map));
942: // Derived class can set up the remainder of the data structures here
943: PetscCall(DerivedCreateIMPLCUPM_Async(*y));
944: // If the other vector is bound to CPU then the memcpy of the ops struct will give the
945: // duplicated vector the host "getarray" function which does not lazily allocate the array
946: // (as it is assumed to always exist). So we force allocation here, before we overwrite the
947: // ops
948: if (v->boundtocpu) PetscCall(HostAllocateCheck_(dctx, *y));
949: // in case the user has done some VecSetOps() tomfoolery
950: (*y)->ops[0] = v->ops[0];
951: {
952: const auto yobj = PetscObjectCast(*y);
954: PetscCall(PetscObjectListDuplicate(vobj->olist, &yobj->olist));
955: PetscCall(PetscFunctionListDuplicate(vobj->qlist, &yobj->qlist));
956: }
957: (*y)->stash.donotstash = v->stash.donotstash;
958: (*y)->stash.ignorenegidx = v->stash.ignorenegidx;
959: (*y)->map->bs = std::abs(v->map->bs);
960: (*y)->bstash.bs = v->bstash.bs;
961: PetscFunctionReturn(PETSC_SUCCESS);
962: }
// Sets v->ops->op_name to either the host implementation (op_host) or the device
// implementation (all trailing arguments), depending on `usehost`. Both `v` and `usehost`
// must be visible at the point of expansion. The device implementation is variadic so that
// expressions containing unparenthesized commas (e.g. template argument lists) can be passed.
#define VecSetOp_CUPM(op_name, op_host, ...) \
  do { \
    if (!(usehost)) { \
      v->ops->op_name = __VA_ARGS__; \
    } else { \
      v->ops->op_name = op_host; \
    } \
  } while (0)
973: // v->ops->bindtocpu
974: template <device::cupm::DeviceType T, typename D>
975: inline PetscErrorCode Vec_CUPMBase<T, D>::BindToCPU_CUPMBase(Vec v, PetscBool usehost, PetscDeviceContext dctx) noexcept
976: {
977: PetscFunctionBegin;
978: v->boundtocpu = usehost;
979: if (usehost) PetscCall(CopyToHost_(dctx, v));
980: PetscCall(PetscStrFreeAllocpy(usehost ? PETSCRANDER48 : PETSCDEVICERAND(), &v->defaultrandtype));
982: // set the base functions that are guaranteed to be the same for both
983: v->ops->duplicate = D::Duplicate;
984: v->ops->create = D::Create;
985: v->ops->destroy = D::Destroy;
986: v->ops->bindtocpu = D::BindToCPU;
987: // Note that setting these to NULL on host breaks convergence in certain areas. I don't know
988: // why, and I don't know how, but it is IMPERATIVE these are set as such!
989: v->ops->replacearray = D::template ReplaceArray<PETSC_MEMTYPE_HOST>;
990: v->ops->restorearray = D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>;
992: // set device-only common functions
993: VecSetOp_CUPM(getarray, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
994: VecSetOp_CUPM(getarraywrite, nullptr, D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
995: VecSetOp_CUPM(restorearraywrite, nullptr, D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
997: VecSetOp_CUPM(getarrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template GetArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
998: VecSetOp_CUPM(restorearrayread, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1000: VecSetOp_CUPM(getarrayandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1001: VecSetOp_CUPM(restorearrayandmemtype, nullptr, D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ_WRITE>);
1003: VecSetOp_CUPM(getarraywriteandmemtype, nullptr, D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>);
1004: VecSetOp_CUPM(restorearraywriteandmemtype, nullptr, [](Vec v, PetscScalar **a, PetscMemType *) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_WRITE>(v, a); });
1006: VecSetOp_CUPM(getarrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a, PetscMemType *m) { return D::template GetArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a), m); });
1007: VecSetOp_CUPM(restorearrayreadandmemtype, nullptr, [](Vec v, const PetscScalar **a) { return D::template RestoreArrayAndMemtype<PETSC_MEMORY_ACCESS_READ>(v, const_cast<PetscScalar **>(a)); });
1009: // set the functions that are always sequential
1010: using VecSeq_T = VecSeq_CUPM<T>;
1011: VecSetOp_CUPM(scale, VecScale_Seq, VecSeq_T::Scale);
1012: VecSetOp_CUPM(copy, VecCopy_Seq, VecSeq_T::Copy);
1013: VecSetOp_CUPM(set, VecSet_Seq, VecSeq_T::Set);
1014: VecSetOp_CUPM(swap, VecSwap_Seq, VecSeq_T::Swap);
1015: VecSetOp_CUPM(axpy, VecAXPY_Seq, VecSeq_T::AXPY);
1016: VecSetOp_CUPM(axpby, VecAXPBY_Seq, VecSeq_T::AXPBY);
1017: VecSetOp_CUPM(maxpy, VecMAXPY_Seq, VecSeq_T::MAXPY);
1018: VecSetOp_CUPM(aypx, VecAYPX_Seq, VecSeq_T::AYPX);
1019: VecSetOp_CUPM(waxpy, VecWAXPY_Seq, VecSeq_T::WAXPY);
1020: VecSetOp_CUPM(axpbypcz, VecAXPBYPCZ_Seq, VecSeq_T::AXPBYPCZ);
1021: VecSetOp_CUPM(pointwisemult, VecPointwiseMult_Seq, VecSeq_T::PointwiseMult);
1022: VecSetOp_CUPM(pointwisedivide, VecPointwiseDivide_Seq, VecSeq_T::PointwiseDivide);
1023: VecSetOp_CUPM(pointwisemax, VecPointwiseMax_Seq, VecSeq_T::PointwiseMax);
1024: VecSetOp_CUPM(pointwisemaxabs, VecPointwiseMaxAbs_Seq, VecSeq_T::PointwiseMaxAbs);
1025: VecSetOp_CUPM(pointwisemin, VecPointwiseMin_Seq, VecSeq_T::PointwiseMin);
1026: VecSetOp_CUPM(setrandom, VecSetRandom_Seq, VecSeq_T::SetRandom);
1027: VecSetOp_CUPM(dot_local, VecDot_Seq, VecSeq_T::Dot);
1028: VecSetOp_CUPM(tdot_local, VecTDot_Seq, VecSeq_T::TDot);
1029: VecSetOp_CUPM(norm_local, VecNorm_Seq, VecSeq_T::Norm);
1030: VecSetOp_CUPM(mdot_local, VecMDot_Seq, VecSeq_T::MDot);
1031: VecSetOp_CUPM(reciprocal, VecReciprocal_Default, VecSeq_T::Reciprocal);
1032: VecSetOp_CUPM(conjugate, VecConjugate_Seq, VecSeq_T::Conjugate);
1033: VecSetOp_CUPM(abs, nullptr, VecSeq_T::Abs);
1034: VecSetOp_CUPM(sqrt, nullptr, VecSeq_T::SqrtAbs);
1035: VecSetOp_CUPM(exp, nullptr, VecSeq_T::Exp);
1036: VecSetOp_CUPM(log, nullptr, VecSeq_T::Log);
1037: VecSetOp_CUPM(shift, nullptr, VecSeq_T::Shift);
1038: VecSetOp_CUPM(dotnorm2, nullptr, D::DotNorm2);
1039: VecSetOp_CUPM(getlocalvector, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1040: VecSetOp_CUPM(restorelocalvector, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ_WRITE>);
1041: VecSetOp_CUPM(getlocalvectorread, nullptr, VecSeq_T::template GetLocalVector<PETSC_MEMORY_ACCESS_READ>);
1042: VecSetOp_CUPM(restorelocalvectorread, nullptr, VecSeq_T::template RestoreLocalVector<PETSC_MEMORY_ACCESS_READ>);
1043: VecSetOp_CUPM(sum, nullptr, VecSeq_T::Sum);
1044: VecSetOp_CUPM(errorwnorm, nullptr, D::ErrorWnorm);
1045: VecSetOp_CUPM(duplicatevecs, VecDuplicateVecs_Default, VecDuplicateVecs_Default);
1046: PetscFunctionReturn(PETSC_SUCCESS);
1047: }
1049: // Called from VecGetSubVector()
1050: template <device::cupm::DeviceType T, typename D>
1051: inline PetscErrorCode Vec_CUPMBase<T, D>::GetArrays_CUPMBase(Vec v, const PetscScalar **host_array, const PetscScalar **device_array, PetscOffloadMask *mask, PetscDeviceContext dctx) noexcept
1052: {
1053: PetscFunctionBegin;
1054: PetscCheckTypeNames(v, VECSEQCUPM(), VECMPICUPM());
1055: if (host_array) {
1056: PetscCall(HostAllocateCheck_(dctx, v));
1057: *host_array = VecIMPLCast(v)->array;
1058: }
1059: if (device_array) {
1060: PetscCall(DeviceAllocateCheck_(dctx, v));
1061: *device_array = VecCUPMCast(v)->array_d;
1062: }
1063: if (mask) *mask = v->offloadmask;
1064: PetscFunctionReturn(PETSC_SUCCESS);
1065: }
1067: template <device::cupm::DeviceType T, typename D>
1068: inline PetscErrorCode Vec_CUPMBase<T, D>::ResetPreallocationCOO_CUPMBase(Vec v, PetscDeviceContext dctx) noexcept
1069: {
1070: PetscFunctionBegin;
1071: if (const auto vcu = VecCUPMCast(v)) {
1072: cupmStream_t stream;
1073: // clang-format off
1074: const auto cntptrs = util::make_array(
1075: std::ref(vcu->jmap1_d),
1076: std::ref(vcu->perm1_d),
1077: std::ref(vcu->imap2_d),
1078: std::ref(vcu->jmap2_d),
1079: std::ref(vcu->perm2_d),
1080: std::ref(vcu->Cperm_d)
1081: );
1082: // clang-format on
1084: PetscCall(GetHandlesFrom_(dctx, &stream));
1085: for (auto &&ptr : cntptrs) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1086: for (auto &&ptr : util::make_array(std::ref(vcu->sendbuf_d), std::ref(vcu->recvbuf_d))) PetscCallCUPM(cupmFreeAsync(ptr.get(), stream));
1087: }
1088: PetscFunctionReturn(PETSC_SUCCESS);
1089: }
1091: template <device::cupm::DeviceType T, typename D>
1092: template <std::size_t NCount, std::size_t NScal>
1093: inline PetscErrorCode Vec_CUPMBase<T, D>::SetPreallocationCOO_CUPMBase(Vec v, PetscCount, const PetscInt[], PetscDeviceContext dctx, const std::array<CooPair<PetscCount>, NCount> &extra_cntptrs, const std::array<CooPair<PetscScalar>, NScal> &bufptrs) noexcept
1094: {
1095: PetscFunctionBegin;
1096: PetscCall(ResetPreallocationCOO_CUPMBase(v, dctx));
1097: // need to instantiate the private pointer if not already
1098: PetscCall(VecCUPMAllocateCheck_(v));
1099: {
1100: const auto vimpl = VecIMPLCast(v);
1101: const auto vcu = VecCUPMCast(v);
1102: // clang-format off
1103: const auto cntptrs = util::concat_array(
1104: util::make_array(
1105: make_coo_pair(vcu->jmap1_d, vimpl->jmap1, v->map->n + 1),
1106: make_coo_pair(vcu->perm1_d, vimpl->perm1, vimpl->tot1)
1107: ),
1108: extra_cntptrs
1109: );
1110: // clang-format on
1111: cupmStream_t stream;
1113: PetscCall(GetHandlesFrom_(dctx, &stream));
1114: // allocate
1115: for (auto &elem : cntptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1116: for (auto &elem : bufptrs) PetscCall(PetscCUPMMallocAsync(&elem.device, elem.size, stream));
1117: // copy
1118: for (const auto &elem : cntptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1119: for (const auto &elem : bufptrs) PetscCall(PetscCUPMMemcpyAsync(elem.device, elem.host, elem.size, cupmMemcpyHostToDevice, stream, true));
1120: }
1121: PetscFunctionReturn(PETSC_SUCCESS);
1122: }
1124: template <device::cupm::DeviceType T, typename D>
1125: inline PetscErrorCode Vec_CUPMBase<T, D>::Convert_IMPL_IMPLCUPM(Vec v) noexcept
1126: {
1127: const auto n = v->map->n;
1128: const auto vimpl = VecIMPLCast(v);
1129: auto &impl_arr = vimpl->array;
1130: PetscBool set = PETSC_FALSE;
1131: PetscDeviceContext dctx;
1133: PetscFunctionBegin;
1134: // If users do not explicitly require pinned memory, we prefer keeping the vector's regular
1135: // host array
1136: PetscCall(VecCUPMCheckMinimumPinnedMemory_Internal(v, &set));
1137: if (set && impl_arr && ((n * sizeof(*impl_arr)) > v->minimum_bytes_pinned_memory)) {
1138: auto &impl_alloc = vimpl->array_allocated;
1139: PetscScalar *new_arr;
1141: // users require pinned memory
1142: {
1143: // Allocate pinned memory and copy over the old array
1144: const auto useit = UseCUPMHostAlloc(PETSC_TRUE);
1146: PetscCall(PetscMalloc1(n, &new_arr));
1147: PetscCall(PetscArraycpy(new_arr, impl_arr, n));
1148: }
1149: PetscCall(PetscFree(impl_alloc));
1150: impl_arr = new_arr;
1151: impl_alloc = new_arr;
1152: v->offloadmask = PETSC_OFFLOAD_CPU;
1153: v->pinned_memory = PETSC_TRUE;
1154: }
1155: PetscCall(GetHandles_(&dctx));
1156: PetscCall(Initialize_CUPMBase(v, PETSC_FALSE, impl_arr, nullptr, dctx));
1157: PetscFunctionReturn(PETSC_SUCCESS);
1158: }
// Boilerplate to be expanded inside a class that builds on Vec_CUPMBase.
//   name - identifier introduced as an alias for Vec_CUPMBase<Tp, __VA_ARGS__>
//   Tp   - device type, also forwarded to PETSC_CUPMOBJECT_HEADER()
//   ...  - remaining template argument(s) of Vec_CUPMBase
// Besides declaring the alias and befriending it, the macro re-exports the listed base-class
// members with using-declarations so they can be named without qualification.
// NOTE(review): PETSC_CUPMOBJECT_HEADER() is defined elsewhere; its contents are not visible
// from this file.
1160: #define PETSC_VEC_CUPM_BASE_CLASS_HEADER(name, Tp, ...) \
1161: PETSC_CUPMOBJECT_HEADER(Tp); \
1162: using name = ::Petsc::vec::cupm::impl::Vec_CUPMBase<Tp, __VA_ARGS__>; \
1163: friend name; \
1164: /* introspection */ \
1165: using name::VecCUPMCast; \
1166: using name::VecIMPLCast; \
1167: using name::VECIMPLCUPM; \
1168: using name::VECIMPL; \
1169: using name::VECSEQCUPM; \
1170: using name::VECMPICUPM; \
1171: using name::VECCUPM; \
1172: using name::VecView_Debug; \
1173: /* utility */ \
1174: using typename name::Vec_CUPM; \
1175: using name::VecCUPMAllocateCheck_; \
1176: using name::VecIMPLAllocateCheck_; \
1177: using name::HostAllocateCheck_; \
1178: using name::DeviceAllocateCheck_; \
1179: using name::CopyToDevice_; \
1180: using name::CopyToHost_; \
1181: using name::Create; \
1182: using name::Destroy; \
1183: using name::GetArray; \
1184: using name::RestoreArray; \
1185: using name::GetArrayAndMemtype; \
1186: using name::RestoreArrayAndMemtype; \
1187: using name::PlaceArray; \
1188: using name::ReplaceArray; \
1189: using name::ResetArray; \
1190: /* base functions */ \
1191: using name::Create_CUPMBase; \
1192: using name::Initialize_CUPMBase; \
1193: using name::Duplicate_CUPMBase; \
1194: using name::BindToCPU_CUPMBase; \
1195: using name::Create_CUPM; \
1196: using name::DeviceArrayRead; \
1197: using name::DeviceArrayWrite; \
1198: using name::DeviceArrayReadWrite; \
1199: using name::HostArrayRead; \
1200: using name::HostArrayWrite; \
1201: using name::HostArrayReadWrite; \
1202: using name::ResetPreallocationCOO_CUPMBase; \
1203: using name::SetPreallocationCOO_CUPMBase; \
1204: using name::Convert_IMPL_IMPLCUPM;
1206: } // namespace impl
1208: } // namespace cupm
1210: } // namespace vec
1212: } // namespace Petsc
1214: #endif // __cplusplus && PetscDefined(HAVE_DEVICE)