Actual source code: matdensecupmimpl.h

  1: #pragma once

  3: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  4: #include <petsc/private/matimpl.h>

  6: #ifdef __cplusplus
  7: #include <petsc/private/deviceimpl.h>
  8: #include <petsc/private/cupmsolverinterface.hpp>
  9: #include <petsc/private/cupmobject.hpp>

 11:   #include "../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp"
 12:   #include "../src/sys/objects/device/impls/cupm/kernels.hpp"

 14:   #include <thrust/device_vector.h>
 15:   #include <thrust/device_ptr.h>
 16:   #include <thrust/iterator/counting_iterator.h>
 17:   #include <thrust/iterator/transform_iterator.h>
 18:   #include <thrust/iterator/permutation_iterator.h>
 19:   #include <thrust/transform.h>
 20:   #include <thrust/copy.h>

 22: namespace Petsc
 23: {

 25: namespace vec
 26: {

 28: namespace cupm
 29: {

 31: namespace impl
 32: {

 34: template <device::cupm::DeviceType>
 35: class VecSeq_CUPM;
 36: template <device::cupm::DeviceType>
 37: class VecMPI_CUPM;

 39: } // namespace impl

 41: } // namespace cupm

 43: } // namespace vec

 45: namespace mat
 46: {

 48: namespace cupm
 49: {

 51: namespace impl
 52: {

 54: // ==========================================================================================
 55: // MatDense_CUPM_Base
 56: //
 57: // A base class to separate out the CRTP code from the common CUPM stuff (like the composed
 58: // function names).
 59: // ==========================================================================================

 61: template <device::cupm::DeviceType T>
 62: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM_Base : protected device::cupm::impl::CUPMObject<T> {
 63: public:
 64:   PETSC_CUPMOBJECT_HEADER(T);

 66:   #define MatDenseCUPMComposedOpDecl(OP_NAME) \
 67:     PETSC_NODISCARD static constexpr const char *PetscConcat(MatDenseCUPM, OP_NAME)() noexcept \
 68:     { \
 69:       return T == device::cupm::DeviceType::CUDA ? PetscStringize(PetscConcat(MatDenseCUDA, OP_NAME)) : PetscStringize(PetscConcat(MatDenseHIP, OP_NAME)); \
 70:     }

 72:   // clang-format off
 73:   MatDenseCUPMComposedOpDecl(GetArray_C)
 74:   MatDenseCUPMComposedOpDecl(GetArrayRead_C)
 75:   MatDenseCUPMComposedOpDecl(GetArrayWrite_C)
 76:   MatDenseCUPMComposedOpDecl(RestoreArray_C)
 77:   MatDenseCUPMComposedOpDecl(RestoreArrayRead_C)
 78:   MatDenseCUPMComposedOpDecl(RestoreArrayWrite_C)
 79:   MatDenseCUPMComposedOpDecl(PlaceArray_C)
 80:   MatDenseCUPMComposedOpDecl(ReplaceArray_C)
 81:   MatDenseCUPMComposedOpDecl(ResetArray_C)
 82:   MatDenseCUPMComposedOpDecl(SetPreallocation_C)
 83:   // clang-format on

 85:   #undef MatDenseCUPMComposedOpDecl

 87:   PETSC_NODISCARD static constexpr MatType       MATSEQDENSECUPM() noexcept;
 88:   PETSC_NODISCARD static constexpr MatType       MATMPIDENSECUPM() noexcept;
 89:   PETSC_NODISCARD static constexpr MatType       MATDENSECUPM() noexcept;
 90:   PETSC_NODISCARD static constexpr MatSolverType MATSOLVERCUPM() noexcept;
 91: };

 93: // ==========================================================================================
 94: // MatDense_CUPM_Base -- Public API
 95: // ==========================================================================================

 97: template <device::cupm::DeviceType T>
 98: inline constexpr MatType MatDense_CUPM_Base<T>::MATSEQDENSECUPM() noexcept
 99: {
100:   return T == device::cupm::DeviceType::CUDA ? MATSEQDENSECUDA : MATSEQDENSEHIP;
101: }

103: template <device::cupm::DeviceType T>
104: inline constexpr MatType MatDense_CUPM_Base<T>::MATMPIDENSECUPM() noexcept
105: {
106:   return T == device::cupm::DeviceType::CUDA ? MATMPIDENSECUDA : MATMPIDENSEHIP;
107: }

109: template <device::cupm::DeviceType T>
110: inline constexpr MatType MatDense_CUPM_Base<T>::MATDENSECUPM() noexcept
111: {
112:   return T == device::cupm::DeviceType::CUDA ? MATDENSECUDA : MATDENSEHIP;
113: }

115: template <device::cupm::DeviceType T>
116: inline constexpr MatSolverType MatDense_CUPM_Base<T>::MATSOLVERCUPM() noexcept
117: {
118:   return T == device::cupm::DeviceType::CUDA ? MATSOLVERCUDA : MATSOLVERHIP;
119: }
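
// As a quick illustration of the helpers above: with T = DeviceType::CUDA they resolve to the
// CUDA flavor, and with T = DeviceType::HIP to the HIP flavor, e.g.
//
//   MatDense_CUPM_Base<DeviceType::CUDA>::MATSEQDENSECUPM()        -> MATSEQDENSECUDA
//   MatDense_CUPM_Base<DeviceType::HIP>::MATSOLVERCUPM()           -> MATSOLVERHIP
//   MatDense_CUPM_Base<DeviceType::CUDA>::MatDenseCUPMGetArray_C() -> "MatDenseCUDAGetArray_C"
//   MatDense_CUPM_Base<DeviceType::HIP>::MatDenseCUPMGetArray_C()  -> "MatDenseHIPGetArray_C"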

121:   #define MATDENSECUPM_BASE_HEADER(T) \
122:     PETSC_CUPMOBJECT_HEADER(T); \
123:     using VecSeq_CUPM = ::Petsc::vec::cupm::impl::VecSeq_CUPM<T>; \
124:     using VecMPI_CUPM = ::Petsc::vec::cupm::impl::VecMPI_CUPM<T>; \
125:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSEQDENSECUPM; \
126:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATMPIDENSECUPM; \
127:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATDENSECUPM; \
128:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MATSOLVERCUPM; \
129:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C; \
130:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C; \
131:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C; \
132:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C; \
133:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C; \
134:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C; \
135:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C; \
136:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C; \
137:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C; \
138:     using ::Petsc::mat::cupm::impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C

140: // forward declare
141: template <device::cupm::DeviceType>
142: class MatDense_Seq_CUPM;
143: template <device::cupm::DeviceType>
144: class MatDense_MPI_CUPM;

146: // ==========================================================================================
147: // MatDense_CUPM
148: //
149: // The true "base" class for MatDenseCUPM. The reason MatDense_CUPM and MatDense_CUPM_Base
150: // exist is to separate out the CRTP code from the non-CRTP code so that the generic functions
151: // can be called via templates below.
152: // ==========================================================================================

154: template <device::cupm::DeviceType T, typename Derived>
155: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM : protected MatDense_CUPM_Base<T> {
156: private:
157:   static PetscErrorCode CheckSaneSequentialMatSizes_(Mat) noexcept;

159: protected:
160:   MATDENSECUPM_BASE_HEADER(T);

162:   template <PetscMemType, PetscMemoryAccessMode>
163:   class MatrixArray;

165:   // Cast the Mat to its host struct, i.e. return the result of (Mat_SeqDense *)m->data
166:   template <typename U = Derived>
167:   PETSC_NODISCARD static constexpr auto    MatIMPLCast(Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(U::MatIMPLCast_(m))
168:   PETSC_NODISCARD static constexpr MatType MATIMPLCUPM() noexcept;

170:   static PetscErrorCode CreateIMPLDenseCUPM(MPI_Comm, PetscInt, PetscInt, PetscInt, PetscInt, PetscScalar *, Mat *, PetscDeviceContext, bool) noexcept;
171:   static PetscErrorCode SetPreallocation(Mat, PetscDeviceContext, PetscScalar *) noexcept;

173:   template <typename F>
174:   static PetscErrorCode DiagonalUnaryTransform(Mat, PetscDeviceContext, F &&) noexcept;

176:   static PetscErrorCode Shift(Mat, PetscScalar) noexcept;
177:   static PetscErrorCode GetDiagonal(Mat, Vec) noexcept;

179:   PETSC_NODISCARD static auto DeviceArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>{dctx, m})
180:   PETSC_NODISCARD static auto DeviceArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
181:   PETSC_NODISCARD static auto DeviceArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
182:   PETSC_NODISCARD static auto HostArrayRead(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>{dctx, m})
183:   PETSC_NODISCARD static auto HostArrayWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>{dctx, m})
184:   PETSC_NODISCARD static auto HostArrayReadWrite(PetscDeviceContext dctx, Mat m) noexcept PETSC_DECLTYPE_AUTO_RETURNS(MatrixArray<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>{dctx, m})
185: };

187: // ==========================================================================================
188: // MatDense_CUPM::MatrixArray
189: // ==========================================================================================

191: template <device::cupm::DeviceType T, typename D>
192: template <PetscMemType MT, PetscMemoryAccessMode MA>
193: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_CUPM<T, D>::MatrixArray : public device::cupm::impl::RestoreableArray<T, MT, MA> {
194:   using base_type = device::cupm::impl::RestoreableArray<T, MT, MA>;

196: public:
197:   MatrixArray(PetscDeviceContext, Mat) noexcept;
198:   ~MatrixArray() noexcept;

200:   // must declare move constructor since we declare a destructor
201:   constexpr MatrixArray(MatrixArray &&) noexcept;

203: private:
204:   Mat m_ = nullptr;
205: };

207: // ==========================================================================================
208: // MatDense_CUPM::MatrixArray -- Public API
209: // ==========================================================================================

211: template <device::cupm::DeviceType T, typename D>
212: template <PetscMemType MT, PetscMemoryAccessMode MA>
213: inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(PetscDeviceContext dctx, Mat m) noexcept : base_type{dctx}, m_{m}
214: {
215:   PetscFunctionBegin;
216:   PetscCallAbort(PETSC_COMM_SELF, D::template GetArray<MT, MA>(m, &this->ptr_, dctx));
217:   PetscFunctionReturnVoid();
218: }

220: template <device::cupm::DeviceType T, typename D>
221: template <PetscMemType MT, PetscMemoryAccessMode MA>
222: inline MatDense_CUPM<T, D>::MatrixArray<MT, MA>::~MatrixArray() noexcept
223: {
224:   PetscFunctionBegin;
225:   PetscCallAbort(PETSC_COMM_SELF, D::template RestoreArray<MT, MA>(m_, &this->ptr_, this->dctx_));
226:   PetscFunctionReturnVoid();
227: }

229: template <device::cupm::DeviceType T, typename D>
230: template <PetscMemType MT, PetscMemoryAccessMode MA>
231: inline constexpr MatDense_CUPM<T, D>::MatrixArray<MT, MA>::MatrixArray(MatrixArray &&other) noexcept : base_type{std::move(other)}, m_{util::exchange(other.m_, nullptr)}
232: {
233: }
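
// Illustrative usage of the MatrixArray RAII wrapper via the DeviceArray*()/HostArray*()
// helpers above: the constructor fetches the array with the requested memtype and access mode
// and the destructor restores it, so the raw pointer is only valid for the lifetime of the
// local object. A rough sketch (do_something_with() is a placeholder):
//
//   {
//     const auto da = D::DeviceArrayRead(dctx, A); // calls GetArray<DEVICE, READ>()
//     do_something_with(da.data());                // raw device pointer
//   }                                              // destructor calls RestoreArray<DEVICE, READ>()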

235: // ==========================================================================================
236: // MatDense_CUPM -- Private API
237: // ==========================================================================================

239: template <device::cupm::DeviceType T, typename D>
240: inline PetscErrorCode MatDense_CUPM<T, D>::CheckSaneSequentialMatSizes_(Mat A) noexcept
241: {
242:   PetscFunctionBegin;
243:   if (PetscDefined(USE_DEBUG)) {
244:     PetscBool isseq;

246:     PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), D::MATSEQDENSECUPM(), &isseq));
247:     if (isseq) {
248:       // doing this check allows both sequential and parallel implementations to just pass in
249:       // A, otherwise they would need to specify rstart, rend, and cols separately!
250:       PetscCheck(A->rmap->rstart == 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row start %" PetscInt_FMT " != 0?", A->rmap->rstart);
251:       PetscCheck(A->rmap->rend == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix row end %" PetscInt_FMT " != number of rows %" PetscInt_FMT, A->rmap->rend, A->rmap->n);
252:       PetscCheck(A->cmap->n == A->cmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Sequential matrix number of local columns %" PetscInt_FMT " != number of global columns %" PetscInt_FMT, A->cmap->n, A->cmap->N);
253:     }
254:   }
255:   PetscFunctionReturn(PETSC_SUCCESS);
256: }

258: // ==========================================================================================
259: // MatDense_CUPM -- Protected API
260: // ==========================================================================================

262: template <device::cupm::DeviceType T, typename D>
263: inline constexpr MatType MatDense_CUPM<T, D>::MATIMPLCUPM() noexcept
264: {
265:   return D::MATIMPLCUPM_();
266: }

268: // Common core for MatCreateSeqDenseCUPM() and MatCreateMPIDenseCUPM()
269: template <device::cupm::DeviceType T, typename D>
270: inline PetscErrorCode MatDense_CUPM<T, D>::CreateIMPLDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscScalar *data, Mat *A, PetscDeviceContext dctx, bool preallocate) noexcept
271: {
272:   Mat mat;

274:   PetscFunctionBegin;
275:   PetscAssertPointer(A, 7);
276:   PetscCall(MatCreate(comm, &mat));
277:   PetscCall(MatSetSizes(mat, m, n, M, N));
278:   PetscCall(MatSetType(mat, D::MATIMPLCUPM()));
279:   if (preallocate) {
280:     PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
281:     PetscCall(D::SetPreallocation(mat, dctx, data));
282:   }
283:   *A = mat;
284:   PetscFunctionReturn(PETSC_SUCCESS);
285: }

287: template <device::cupm::DeviceType T, typename D>
288: inline PetscErrorCode MatDense_CUPM<T, D>::SetPreallocation(Mat A, PetscDeviceContext dctx, PetscScalar *device_array) noexcept
289: {
290:   PetscFunctionBegin;
292:   // might be the local (sequential) matrix of a MatMPIDense_CUPM. Since this would be called
293:   // from the MPI matrix's impl, MATIMPLCUPM() would return MATMPIDENSECUPM().
295:   PetscCheckTypeNames(A, D::MATSEQDENSECUPM(), D::MATMPIDENSECUPM());
296:   PetscCall(PetscLayoutSetUp(A->rmap));
297:   PetscCall(PetscLayoutSetUp(A->cmap));
298:   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
299:   PetscCall(D::SetPreallocation_(A, dctx, device_array));
300:   A->preallocated = PETSC_TRUE;
301:   A->assembled    = PETSC_TRUE;
302:   PetscFunctionReturn(PETSC_SUCCESS);
303: }

305: namespace detail
306: {

308:   #if CCCL_VERSION >= 3000000
309: template <class T>
310: using iter_difference_t = cuda::std::iter_difference_t<T>;
311:   #else
312: template <class T>
313: using iter_difference_t = typename thrust::iterator_difference<T>::type;
314:   #endif

316: // ==========================================================================================
317: // MatrixIteratorBase
318: //
319: // A base class for creating thrust iterators over the local sub-matrix. This will set up the
320: // proper iterator definitions so that thrust handles them correctly. Template
321: // parameters are as follows:
322: //
323: // - Iterator:
324: // The type of the primary array iterator. Usually this is
325: // thrust::device_vector::iterator.
326: //
327: // - IndexFunctor:
328: // This should be a functor whose operator(), when called with an index `i`, returns the
329: // i'th permuted index into the array. For example, it could return the index of the i'th
330: // diagonal entry.
331: // ==========================================================================================
332: template <typename Iterator, typename IndexFunctor>
333: class MatrixIteratorBase {
334: public:
335:   using array_iterator_type = Iterator;
336:   using index_functor_type  = IndexFunctor;

338:   using difference_type     = iter_difference_t<array_iterator_type>;
339:   using CountingIterator    = thrust::counting_iterator<difference_type>;
340:   using TransformIterator   = thrust::transform_iterator<index_functor_type, CountingIterator>;
341:   using PermutationIterator = thrust::permutation_iterator<array_iterator_type, TransformIterator>;
342:   using iterator            = PermutationIterator; // type of the begin/end iterator

344:   constexpr MatrixIteratorBase(array_iterator_type first, array_iterator_type last, index_functor_type idx_func) noexcept : first{std::move(first)}, last{std::move(last)}, func{std::move(idx_func)} { }

346:   PETSC_NODISCARD iterator begin() const noexcept
347:   {
348:     return PermutationIterator{
349:       first, TransformIterator{CountingIterator{0}, func}
350:     };
351:   }

353: protected:
354:   array_iterator_type first;
355:   array_iterator_type last;
356:   index_functor_type  func;
357: };
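
// In effect begin() chains counting -> transform(func) -> permutation, so that dereferencing
// the i'th position of the resulting iterator yields the func(i)'th element of the underlying
// array, i.e. (illustrative only)
//
//   *(begin() + i) == first[func(i)]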

359: // ==========================================================================================
360: // StridedIndexFunctor
361: //
362: // Functor which permutes a linear index range into strided matrix indices. Usually used to
363: // get the diagonal.
364: // ==========================================================================================
365: template <typename T>
366: struct StridedIndexFunctor {
367:   PETSC_NODISCARD PETSC_HOSTDEVICE_INLINE_DECL constexpr T operator()(const T &i) const noexcept { return stride * i; }

369:   T stride;
370: };
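
// For example, with stride = lda + 1 (as DiagonalIterator below sets it up) the functor maps
// the linear indices 0, 1, 2, ... to 0, lda + 1, 2 * (lda + 1), ..., i.e. it steps one row and
// one column at a time through a column-major array with leading dimension lda, which is
// exactly the main diagonal.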

372: template <typename Iterator>
373: class DiagonalIterator : public MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>> {
374: public:
375:   using base_type = MatrixIteratorBase<Iterator, StridedIndexFunctor<iter_difference_t<Iterator>>>;

377:   using difference_type = typename base_type::difference_type;
378:   using iterator        = typename base_type::iterator;

380:   constexpr DiagonalIterator(Iterator first, Iterator last, difference_type stride) noexcept : base_type{std::move(first), std::move(last), {stride}} { }

382:   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->last - this->first + this->func.stride - 1) / this->func.stride; }
383: };

385: template <typename T>
386: inline DiagonalIterator<typename thrust::device_vector<T>::iterator> MakeDiagonalIterator(T *data, PetscInt rstart, PetscInt rend, PetscInt cols, PetscInt lda) noexcept
387: {
388:   const auto        rend2 = std::min(rend, cols);
389:   const std::size_t begin = rstart * lda;
390:   const std::size_t end   = rend2 - rstart + rend2 * lda;
391:   const auto        dptr  = thrust::device_pointer_cast(data);

393:   return {dptr + begin, dptr + end, lda + 1};
394: }
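
// A small worked example of the offsets computed above: for a sequential 4 x 3 matrix
// (rstart = 0, rend = 4, cols = 3) with lda = 4 we get rend2 = 3, begin = 0 * 4 = 0 and
// end = 3 - 0 + 3 * 4 = 15, so the iterator spans linear offsets [0, 15) with stride
// lda + 1 = 5 and visits offsets 0, 5, 10, i.e. the diagonal entries (0,0), (1,1), (2,2)
// of the column-major array.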

396: } // namespace detail

398: template <device::cupm::DeviceType T, typename D>
399: template <typename F>
400: inline PetscErrorCode MatDense_CUPM<T, D>::DiagonalUnaryTransform(Mat A, PetscDeviceContext dctx, F &&functor) noexcept
401: {
402:   const auto rstart = A->rmap->rstart;
403:   const auto rend   = A->rmap->rend;
404:   const auto gcols  = A->cmap->N;
405:   const auto rend2  = std::min(rend, gcols);

407:   PetscFunctionBegin;
408:   PetscCall(CheckSaneSequentialMatSizes_(A));
409:   if (rend2 > rstart) {
410:     const auto   da = D::DeviceArrayReadWrite(dctx, A);
411:     cupmStream_t stream;
412:     PetscInt     lda;

414:     PetscCall(MatDenseGetLDA(A, &lda));
415:     PetscCall(D::GetHandlesFrom_(dctx, &stream));
416:     {
417:       auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);

419:       // clang-format off
420:       PetscCallThrust(
421:         THRUST_CALL(
422:           thrust::transform,
423:           stream,
424:           diagonal.begin(), diagonal.end(), diagonal.begin(),
425:           std::forward<F>(functor)
426:         )
427:       );
428:       // clang-format on
429:     }
430:     PetscCall(PetscLogGpuFlops(rend2 - rstart));
431:   }
432:   PetscFunctionReturn(PETSC_SUCCESS);
433: }

435: template <device::cupm::DeviceType T, typename D>
436: inline PetscErrorCode MatDense_CUPM<T, D>::Shift(Mat A, PetscScalar alpha) noexcept
437: {
438:   PetscDeviceContext dctx;

440:   PetscFunctionBegin;
441:   PetscCall(GetHandles_(&dctx));
442:   PetscCall(DiagonalUnaryTransform(A, dctx, device::cupm::functors::make_plus_equals(alpha)));
443:   PetscFunctionReturn(PETSC_SUCCESS);
444: }

446: template <device::cupm::DeviceType T, typename D>
447: inline PetscErrorCode MatDense_CUPM<T, D>::GetDiagonal(Mat A, Vec v) noexcept
448: {
449:   const auto         rstart = A->rmap->rstart;
450:   const auto         rend   = A->rmap->rend;
451:   const auto         gcols  = A->cmap->N;
452:   PetscInt           lda;
453:   PetscDeviceContext dctx;

455:   PetscFunctionBegin;
456:   PetscCall(CheckSaneSequentialMatSizes_(A));
457:   PetscCall(GetHandles_(&dctx));
458:   PetscCall(MatDenseGetLDA(A, &lda));
459:   {
460:     auto dv       = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
461:     auto da       = D::DeviceArrayRead(dctx, A);
462:     auto diagonal = detail::MakeDiagonalIterator(da.data(), rstart, rend, gcols, lda);
463:     // We must have this cast outside of THRUST_CALL(). Without it, GCC 6.4 - 7.5, and 11.3.0
464:     // throw spurious warnings:
465:     //
466:     // warning: 'MatDense_CUPM<...>::GetDiagonal(Mat, Vec)::<lambda()>' declared with greater
467:     // visibility than the type of its field 'MatDense_CUPM<...>::GetDiagonal(Mat,
468:     // Vec)::<lambda()>::<...>' [-Wattributes]
469:     // 460 |     PetscCallThrust(
470:     //     |     ^~~~~~~~~~~~~~~~
471:     auto         dvp = thrust::device_pointer_cast(dv.data());
472:     cupmStream_t stream;

474:     PetscCall(GetHandlesFrom_(dctx, &stream));
475:     PetscCallThrust(THRUST_CALL(thrust::copy, stream, diagonal.begin(), diagonal.end(), dvp));
476:   }
477:   PetscFunctionReturn(PETSC_SUCCESS);
478: }

480:   #define MatComposeOp_CUPM(use_host, pobj, op_str, op_host, ...) \
481:     do { \
482:       if (use_host) { \
483:         PetscCall(PetscObjectComposeFunction(pobj, op_str, op_host)); \
484:       } else { \
485:         PetscCall(PetscObjectComposeFunction(pobj, op_str, __VA_ARGS__)); \
486:       } \
487:     } while (0)

489:   #define MatSetOp_CUPM(use_host, mat, op_name, op_host, ...) \
490:     do { \
491:       if (use_host) { \
492:         (mat)->ops->op_name = op_host; \
493:       } else { \
494:         (mat)->ops->op_name = __VA_ARGS__; \
495:       } \
496:     } while (0)
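
// Both macros select between a host and a device implementation based on a runtime flag. A
// hypothetical use from one of the derived classes (the names usehost, MatShift_Host,
// Shift_CUPM, and GetArray_CUPM are purely illustrative) could look like
//
//   MatSetOp_CUPM(usehost, A, shift, MatShift_Host, Shift_CUPM);
//   MatComposeOp_CUPM(usehost, PetscObjectCast(A), MatDenseCUPMGetArray_C(), nullptr, GetArray_CUPM);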

498:   #define MATDENSECUPM_HEADER(T, ...) \
499:     MATDENSECUPM_BASE_HEADER(T); \
500:     friend class ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>; \
501:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MatIMPLCast; \
502:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::MATIMPLCUPM; \
503:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::CreateIMPLDenseCUPM; \
504:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::SetPreallocation; \
505:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayRead; \
506:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayWrite; \
507:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DeviceArrayReadWrite; \
508:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayRead; \
509:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayWrite; \
510:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::HostArrayReadWrite; \
511:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::DiagonalUnaryTransform; \
512:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::Shift; \
513:     using ::Petsc::mat::cupm::impl::MatDense_CUPM<T, __VA_ARGS__>::GetDiagonal

515: } // namespace impl

517: // ==========================================================================================
518: // MatDense_CUPM -- Implementations
519: // ==========================================================================================

521: template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
522: inline PetscErrorCode MatDenseCUPMGetArray_Private(Mat A, PetscScalar **array) noexcept
523: {
524:   PetscFunctionBegin;
526:   PetscAssertPointer(array, 2);
527:   switch (access) {
528:   case PETSC_MEMORY_ACCESS_READ:
529:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayRead_C(), (Mat, PetscScalar **), (A, array));
530:     break;
531:   case PETSC_MEMORY_ACCESS_WRITE:
532:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArrayWrite_C(), (Mat, PetscScalar **), (A, array));
533:     break;
534:   case PETSC_MEMORY_ACCESS_READ_WRITE:
535:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMGetArray_C(), (Mat, PetscScalar **), (A, array));
536:     break;
537:   }
538:   if (PetscMemoryAccessWrite(access)) PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
539:   PetscFunctionReturn(PETSC_SUCCESS);
540: }

542: template <device::cupm::DeviceType T, PetscMemoryAccessMode access>
543: inline PetscErrorCode MatDenseCUPMRestoreArray_Private(Mat A, PetscScalar **array) noexcept
544: {
545:   PetscFunctionBegin;
547:   if (array) PetscAssertPointer(array, 2);
548:   switch (access) {
549:   case PETSC_MEMORY_ACCESS_READ:
550:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayRead_C(), (Mat, PetscScalar **), (A, array));
551:     break;
552:   case PETSC_MEMORY_ACCESS_WRITE:
553:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArrayWrite_C(), (Mat, PetscScalar **), (A, array));
554:     break;
555:   case PETSC_MEMORY_ACCESS_READ_WRITE:
556:     PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMRestoreArray_C(), (Mat, PetscScalar **), (A, array));
557:     break;
558:   }
559:   if (PetscMemoryAccessWrite(access)) {
560:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
561:     A->offloadmask = PETSC_OFFLOAD_GPU;
562:   }
563:   if (array) *array = nullptr;
564:   PetscFunctionReturn(PETSC_SUCCESS);
565: }

567: template <device::cupm::DeviceType T>
568: inline PetscErrorCode MatDenseCUPMGetArray(Mat A, PetscScalar **array) noexcept
569: {
570:   PetscFunctionBegin;
571:   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
572:   PetscFunctionReturn(PETSC_SUCCESS);
573: }

575: template <device::cupm::DeviceType T>
576: inline PetscErrorCode MatDenseCUPMGetArrayRead(Mat A, const PetscScalar **array) noexcept
577: {
578:   PetscFunctionBegin;
579:   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
580:   PetscFunctionReturn(PETSC_SUCCESS);
581: }

583: template <device::cupm::DeviceType T>
584: inline PetscErrorCode MatDenseCUPMGetArrayWrite(Mat A, PetscScalar **array) noexcept
585: {
586:   PetscFunctionBegin;
587:   PetscCall(MatDenseCUPMGetArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
588:   PetscFunctionReturn(PETSC_SUCCESS);
589: }

591: template <device::cupm::DeviceType T>
592: inline PetscErrorCode MatDenseCUPMRestoreArray(Mat A, PetscScalar **array) noexcept
593: {
594:   PetscFunctionBegin;
595:   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ_WRITE>(A, array));
596:   PetscFunctionReturn(PETSC_SUCCESS);
597: }

599: template <device::cupm::DeviceType T>
600: inline PetscErrorCode MatDenseCUPMRestoreArrayRead(Mat A, const PetscScalar **array) noexcept
601: {
602:   PetscFunctionBegin;
603:   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_READ>(A, const_cast<PetscScalar **>(array)));
604:   PetscFunctionReturn(PETSC_SUCCESS);
605: }

607: template <device::cupm::DeviceType T>
608: inline PetscErrorCode MatDenseCUPMRestoreArrayWrite(Mat A, PetscScalar **array) noexcept
609: {
610:   PetscFunctionBegin;
611:   PetscCall(MatDenseCUPMRestoreArray_Private<T, PETSC_MEMORY_ACCESS_WRITE>(A, array));
612:   PetscFunctionReturn(PETSC_SUCCESS);
613: }
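
// The Get/Restore wrappers above follow the usual PETSc access pattern: every Get must be
// paired with the matching Restore, the write-capable paths bump the PetscObject state, and
// restoring a write access marks the data as resident on the GPU. A rough usage sketch
// (T is whichever DeviceType the caller instantiates):
//
//   const PetscScalar *a;
//
//   PetscCall(MatDenseCUPMGetArrayRead<T>(A, &a));
//   // ... use the device pointer a ...
//   PetscCall(MatDenseCUPMRestoreArrayRead<T>(A, &a)); // a is set to nullptr on return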

615: template <device::cupm::DeviceType T>
616: inline PetscErrorCode MatDenseCUPMPlaceArray(Mat A, const PetscScalar *array) noexcept
617: {
618:   PetscFunctionBegin;
620:   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMPlaceArray_C(), (Mat, const PetscScalar *), (A, array));
621:   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
622:   A->offloadmask = PETSC_OFFLOAD_GPU;
623:   PetscFunctionReturn(PETSC_SUCCESS);
624: }

626: template <device::cupm::DeviceType T>
627: inline PetscErrorCode MatDenseCUPMReplaceArray(Mat A, const PetscScalar *array) noexcept
628: {
629:   PetscFunctionBegin;
631:   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMReplaceArray_C(), (Mat, const PetscScalar *), (A, array));
632:   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
633:   A->offloadmask = PETSC_OFFLOAD_GPU;
634:   PetscFunctionReturn(PETSC_SUCCESS);
635: }

637: template <device::cupm::DeviceType T>
638: inline PetscErrorCode MatDenseCUPMResetArray(Mat A) noexcept
639: {
640:   PetscFunctionBegin;
642:   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMResetArray_C(), (Mat), (A));
643:   PetscCall(PetscObjectStateIncrease(PetscObjectCast(A)));
644:   PetscFunctionReturn(PETSC_SUCCESS);
645: }

647: template <device::cupm::DeviceType T>
648: inline PetscErrorCode MatDenseCUPMSetPreallocation(Mat A, PetscScalar *device_data, PetscDeviceContext dctx = nullptr) noexcept
649: {
650:   PetscFunctionBegin;
652:   PetscUseMethod(A, impl::MatDense_CUPM_Base<T>::MatDenseCUPMSetPreallocation_C(), (Mat, PetscDeviceContext, PetscScalar *), (A, dctx, device_data));
653:   PetscFunctionReturn(PETSC_SUCCESS);
654: }

656: } // namespace cupm

658: } // namespace mat

660: } // namespace Petsc

662: #endif // __cplusplus