Actual source code: cupminterface.hpp

  1: #pragma once

  3: #include <petscdevice_cupm.h>

  5: #include <petsc/private/cpputil.hpp>
  6: #include <petsc/private/petscadvancedmacros.h>

  8: #include <petsc/private/cpp/array.hpp>

 10: namespace Petsc
 11: {

 13: namespace device
 14: {

 16: namespace cupm
 17: {

 19: // enum describing available cupm devices, this is used as the template parameter to any
 20: // class subclassing the Interface or using it as a member variable
 21: enum class DeviceType : int {
 22:   CUDA,
 23:   HIP
 24: };

 26: // clang-format off
 27: static constexpr std::array<const char *const, 5> DeviceTypes = {
 28:   "cuda",
 29:   "hip",
 30:   "Petsc::device::cupm::DeviceType",
 31:   "Petsc::device::cupm::DeviceType::",
 32:   nullptr
 33: };
 34: // clang-format on

 36: namespace impl
 37: {

 39: #define PetscCallCUPM_(__abort_fn__, __comm__, ...) \
 40:   do { \
 41:     PetscStackUpdateLine; \
 42:     const cupmError_t cerr_p_ = __VA_ARGS__; \
 43:     __abort_fn__(cerr_p_ == cupmSuccess, __comm__, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
 44:   } while (0)

 46: // A backend agnostic PetscCallCUPM() function, this will only work inside the member
 47: // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
 48: // functions can also be wrapped inline:
 49: //
 50: // PetscCallCUPM(foo());
 51: #define PetscCallCUPM(...)             PetscCallCUPM_(PetscCheck, PETSC_COMM_SELF, __VA_ARGS__)
 52: #define PetscCallCUPMAbort(comm_, ...) PetscCallCUPM_(PetscCheckAbort, comm_, __VA_ARGS__)

 54: // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
 55: //
 56: // input params:
 57: // our_name   - the name of the alias
 58: // their_name - the name of the function being aliased
 59: //
 60: // notes:
 61: // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
 62: //
 63: // example usage:
 64: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
 65: // template 
 66: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 67: // {
 68: //   return cudaMalloc(std::forward(args)...);
 69: // }
 70: //
 71: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
 72: // template 
 73: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
 74: // {
 75: //   return hipMalloc(std::forward(args)...);
 76: // }
 77: #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)

 79: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
 80: // discard the last N arguments
 81: //
 82: // input params:
 83: // our_name   - the name of the alias
 84: // their_name - the name of the function being aliased
 85: // N          - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
 86: //
 87: // notes:
 88: // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
 89: //
 90: // example use:
 91: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
 92: // template 
 93: // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
 94: // return type deduction*
 95: // {
 96: //   (void)argend;
 97: //   return cudaMalloc(std::forward(args)...);
 98: // }
 99: #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)

101: // Base class that holds functions and variables that don't require CUDA or HIP to be present
102: // on the system
103: template <DeviceType T>
104: struct InterfaceBase {
105:   static const DeviceType type = T;

107:   PETSC_NODISCARD static constexpr const char *cupmName() noexcept
108:   {
109:     static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
110:     static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
111:     return std::get<util::to_underlying(T)>(DeviceTypes);
112:   }

114:   PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; }

116:   PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }

118:   PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
119: };

121: // declare the base class static member variables
122: template <DeviceType T>
123: const DeviceType InterfaceBase<T>::type;

125: #define PETSC_CUPM_BASE_CLASS_HEADER(T) \
126:   using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \
127:   using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \
128:   using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \
129:   using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \
130:   using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM

132: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
133: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
134: // import them into classes.
135: template <DeviceType>
136: struct InterfaceImpl;

138: #if PetscDefined(HAVE_CUDA)
139: template <>
140: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
141:   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA);

143:   // typedefs
144:   using cupmError_t             = cudaError_t;
145:   using cupmEvent_t             = cudaEvent_t;
146:   using cupmStream_t            = cudaStream_t;
147:   using cupmDeviceProp_t        = cudaDeviceProp;
148:   using cupmMemcpyKind_t        = cudaMemcpyKind;
149:   using cupmDeviceAttr_t        = cudaDeviceAttr;
150:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
151:   using cupmPointerAttributes_t = cudaPointerAttributes;
152:   using cupmMemoryType_t        = enum cudaMemoryType;
153:   using cupmDim3                = dim3;
154:   using cupmHostFn_t            = cudaHostFn_t;
155:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
156:   using cupmMemPool_t   = cudaMemPool_t;
157:   using cupmMemPoolAttr = cudaMemPoolAttr;
158:   #else
159:   using cupmMemPool_t   = void *;
160:   using cupmMemPoolAttr = unsigned int;
161:   #endif

163:   // values
164:   static const auto cupmSuccess                 = cudaSuccess;
165:   static const auto cupmErrorNotReady           = cudaErrorNotReady;
166:   static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
167:   static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
168:   static const auto cupmErrorStubLibrary =
169:   #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
170:     cudaErrorStubLibrary;
171:   #else
172:     cudaErrorInsufficientDriver;
173:   #endif

175:   static const auto cupmErrorNoDevice          = cudaErrorNoDevice;
176:   static const auto cupmStreamDefault          = cudaStreamDefault;
177:   static const auto cupmStreamNonBlocking      = cudaStreamNonBlocking;
178:   static const auto cupmDeviceMapHost          = cudaDeviceMapHost;
179:   static const auto cupmMemcpyHostToDevice     = cudaMemcpyHostToDevice;
180:   static const auto cupmMemcpyDeviceToHost     = cudaMemcpyDeviceToHost;
181:   static const auto cupmMemcpyDeviceToDevice   = cudaMemcpyDeviceToDevice;
182:   static const auto cupmMemcpyHostToHost       = cudaMemcpyHostToHost;
183:   static const auto cupmMemcpyDefault          = cudaMemcpyDefault;
184:   static const auto cupmMemoryTypeHost         = cudaMemoryTypeHost;
185:   static const auto cupmMemoryTypeDevice       = cudaMemoryTypeDevice;
186:   static const auto cupmMemoryTypeManaged      = cudaMemoryTypeManaged;
187:   static const auto cupmEventDisableTiming     = cudaEventDisableTiming;
188:   static const auto cupmHostAllocDefault       = cudaHostAllocDefault;
189:   static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
190:   static const auto cupmMemPoolAttrReleaseThreshold =
191:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
192:     cudaMemPoolAttrReleaseThreshold;
193:   #else
194:     cupmMemPoolAttr{0};
195:   #endif
196:   static const auto cupmDevAttrClockRate       = cudaDevAttrClockRate;
197:   static const auto cupmDevAttrMemoryClockRate = cudaDevAttrMemoryClockRate;

199:   // error functions
200:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
201:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
202:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)

204:   // device management
205:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
206:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
207:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, cudaDeviceGetAttribute)
208:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
209:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
210:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
211:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
212:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
213:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
214:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
215:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
216:   #else
217:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
218:   {
219:     *pool = nullptr;
220:     return cupmSuccess;
221:   }

223:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
224:   #endif
225:   // CUDA has no cudaInit() to match hipInit()
226:   PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }

228:   // stream management
229:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
230:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
231:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
232:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
233:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
234:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
235:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
236:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
237:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
238:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
239:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
240:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
241:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
242:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
243:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
244:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)

246:   // memory management
247:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
248:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
249:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
250:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
251:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
252:   #else
253:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
254:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
255:   #endif
256:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
257:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
258:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
259:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
260:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
261:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
262:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
263:   #else
264:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
265:   #endif
266:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D)
267:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1)
268:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D)
269:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1)

271:   // launch control
272:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
273:   template <typename FunctionT, typename... KernelArgsT>
274:   PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
275:   {
276:     static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");
277:     void *args[] = {(void *)std::addressof(kernelArgs)...};

279:     return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
280:   }
281: };
282: #endif // PetscDefined(HAVE_CUDA)

284: #if PetscDefined(HAVE_HIP)
285: template <>
286: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
287:   PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP);

289:   // typedefs
290:   using cupmError_t             = hipError_t;
291:   using cupmEvent_t             = hipEvent_t;
292:   using cupmStream_t            = hipStream_t;
293:   using cupmDeviceProp_t        = hipDeviceProp_t;
294:   using cupmMemcpyKind_t        = hipMemcpyKind;
295:   using cupmDeviceAttr_t        = hipDeviceAttribute_t;
296:   using cupmComplex_t           = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
297:   using cupmPointerAttributes_t = hipPointerAttribute_t;
298:   using cupmMemoryType_t        = enum hipMemoryType;
299:   using cupmDim3                = dim3;
300:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
301:   using cupmHostFn_t    = hipHostFn_t;
302:   using cupmMemPool_t   = hipMemPool_t;
303:   using cupmMemPoolAttr = hipMemPoolAttr;
304:   #else
305:   using cupmHostFn_t    = void (*)(void *);
306:   using cupmMemPool_t   = void *;
307:   using cupmMemPoolAttr = unsigned int;
308:   #endif

310:   // values
311:   static const auto cupmSuccess       = hipSuccess;
312:   static const auto cupmErrorNotReady = hipErrorNotReady;
313:   // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
314:   static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
315:   static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
316:   // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
317:   static const auto cupmErrorStubLibrary     = hipErrorInsufficientDriver;
318:   static const auto cupmErrorNoDevice        = hipErrorNoDevice;
319:   static const auto cupmStreamDefault        = hipStreamDefault;
320:   static const auto cupmStreamNonBlocking    = hipStreamNonBlocking;
321:   static const auto cupmDeviceMapHost        = hipDeviceMapHost;
322:   static const auto cupmMemcpyHostToDevice   = hipMemcpyHostToDevice;
323:   static const auto cupmMemcpyDeviceToHost   = hipMemcpyDeviceToHost;
324:   static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
325:   static const auto cupmMemcpyHostToHost     = hipMemcpyHostToHost;
326:   static const auto cupmMemcpyDefault        = hipMemcpyDefault;
327:   static const auto cupmMemoryTypeHost       = hipMemoryTypeHost;
328:   static const auto cupmMemoryTypeDevice     = hipMemoryTypeDevice;
329:   // see
330:   // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
331:   static const auto cupmMemoryTypeManaged      = hipMemoryTypeUnified;
332:   static const auto cupmEventDisableTiming     = hipEventDisableTiming;
333:   static const auto cupmHostAllocDefault       = hipHostMallocDefault;
334:   static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
335:   static const auto cupmMemPoolAttrReleaseThreshold =
336:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
337:     hipMemPoolAttrReleaseThreshold;
338:   #else
339:     cupmMemPoolAttr{0};
340:   #endif
341:   static const auto cupmDevAttrClockRate       = hipDeviceAttributeClockRate;
342:   static const auto cupmDevAttrMemoryClockRate = hipDeviceAttributeMemoryClockRate;

344:   // error functions
345:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
346:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
347:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)

349:   // device management
350:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
351:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
352:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, hipDeviceGetAttribute)
353:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
354:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
355:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
356:   PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
357:   PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
358:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
359:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
360:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
361:   #else
362:   PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
363:   {
364:     *pool = nullptr;
365:     return cupmSuccess;
366:   }

368:   PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
369:   #endif
370:   PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)

372:   // stream management
373:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
374:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
375:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
376:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
377:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
378:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
379:   PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
380:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
381:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
382:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
383:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
384:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
385:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
386:   PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
387:   PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
388:   PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)

390:   // memory management
391:   PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
392:   PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
393:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
394:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
395:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
396:   #else
397:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
398:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
399:   #endif
400:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
401:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
402:   // hipMallocHost is deprecated
403:   PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
404:   // hipFreeHost is deprecated
405:   PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
406:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
407:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
408:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D)
409:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1)
410:   PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D)
411:   PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1)

413:   // launch control
414:   // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
415:   // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
416:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
417:   PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
418:   #else
419:   PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
420:   {
421:     // the only correct way to spoof this function is to do it synchronously...
422:     auto herr = hipStreamSynchronize(stream);
423:     if (PetscUnlikely(herr != hipSuccess)) return herr;
424:     fn(ctx);
425:     return herr;
426:   }
427:   #endif

429:   template <typename FunctionT, typename... KernelArgsT>
430:   PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
431:   {
432:     void *args[] = {(void *)std::addressof(kernelArgs)...};

434:     return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
435:   }
436: };
437: #endif // PetscDefined(HAVE_HIP)

439: // shorthand for bringing all of the typedefs from the base Interface class into your own,
440: // it's annoying that c++ doesn't have a way to do this automatically
441: #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \
442:   PETSC_CUPM_BASE_CLASS_HEADER(T); \
443:   /* types */ \
444:   using cupmError_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \
445:   using cupmEvent_t             = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \
446:   using cupmStream_t            = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \
447:   using cupmDeviceProp_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \
448:   using cupmMemcpyKind_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \
449:   using cupmDeviceAttr_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceAttr_t; \
450:   using cupmComplex_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \
451:   using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \
452:   using cupmMemoryType_t        = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \
453:   using cupmDim3                = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \
454:   using cupmHostFn_t            = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostFn_t; \
455:   using cupmMemPool_t           = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \
456:   using cupmMemPoolAttr         = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \
457:   /* variables */ \
458:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \
459:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \
460:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \
461:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \
462:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \
463:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \
464:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \
465:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \
466:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \
467:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \
468:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \
469:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \
470:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \
471:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \
472:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \
473:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \
474:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \
475:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \
476:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \
477:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \
478:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \
479:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrClockRate; \
480:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrMemoryClockRate; \
481:   /* functions */ \
482:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \
483:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \
484:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \
485:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \
486:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \
487:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetAttribute; \
488:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \
489:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \
490:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \
491:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \
492:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \
493:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \
494:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \
495:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \
496:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \
497:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \
498:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \
499:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \
500:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \
501:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \
502:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \
503:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \
504:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \
505:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \
506:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \
507:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \
508:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \
509:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \
510:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \
511:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \
512:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \
513:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \
514:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \
515:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \
516:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \
517:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \
518:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \
519:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \
520:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \
521:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \
522:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \
523:   using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc

525: #if PetscHasAttribute(always_inline)
526:   // https://gcc.gnu.org/bugzilla//show_bug.cgi?id=109464
527:   #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND __attribute__((always_inline))
528: #else
529:   #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND
530: #endif

532: // The actual interface class
533: template <DeviceType T>
534: struct Interface : InterfaceImpl<T> {
535: private:
536:   using interface_type = InterfaceImpl<T>;

538: public:
539:   PETSC_CUPM_IMPL_CLASS_HEADER(T);

541:   using cupmReal_t   = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
542:   using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;

544:   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
545:   {
546: #if PetscDefined(USE_COMPLEX)
547:     return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
548: #else
549:     return static_cast<cupmScalar_t>(s);
550: #endif
551:   }

553:   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }

555:   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }

557:   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }

559:   PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }

561: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
562:   #define PETSC_PKG_CUDA_VERSION_GE(...) 0
563:   #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
564: #endif

566: #if !defined(PETSC_PKG_HIP_VERSION_LT)
567:   #define PETSC_PKG_HIP_VERSION_LT(...) 0
568:   #define CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT
569: #endif

571:   static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
572:   {
573:     cupmPointerAttributes_t attr;
574:     cupmError_t             cerr;

576:     PetscFunctionBegin;
577:     if (type) PetscAssertPointer(type, 2);
578:     if (registered) {
579:       PetscAssertPointer(registered, 3);
580:       *registered = PETSC_FALSE;
581:     }
582:     if (managed) {
583:       PetscAssertPointer(managed, 4);
584:       *managed = PETSC_FALSE;
585:     }
586:     // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
587:     // a host pointer returns cudaErrorInvalidValue
588:     cerr = cupmPointerGetAttributes(&attr, data);
589:     cerr = cupmGetLastError();
590:     // HIP seems to always have used memoryType though
591: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
592:     const auto mtype = attr.memoryType;
593:     if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
594: #else
595:     if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
596:     const auto mtype = attr.type;
597:     if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
598: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
599:     if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
600:     if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
601:     PetscFunctionReturn(PETSC_SUCCESS);
602:   }
603: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
604:   #undef PETSC_PKG_CUDA_VERSION_GE
605: #endif
606: #if defined(CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT)
607:   #undef PETSC_PKG_HIP_VERSION_LT
608: #endif

610:   PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
611:   {
612:     switch (mode) {
613:     case PETSC_DEVICE_COPY_HTOH:
614:       return cupmMemcpyHostToHost;
615:     case PETSC_DEVICE_COPY_HTOD:
616:       return cupmMemcpyHostToDevice;
617:     case PETSC_DEVICE_COPY_DTOD:
618:       return cupmMemcpyDeviceToDevice;
619:     case PETSC_DEVICE_COPY_DTOH:
620:       return cupmMemcpyDeviceToHost;
621:     case PETSC_DEVICE_COPY_AUTO:
622:       return cupmMemcpyDefault;
623:     }
624:     PetscUnreachable();
625:     return cupmMemcpyDefault;
626:   }

628:   // these change what the arguments mean, so need to namespace these
629:   template <typename M>
630:   static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
631:   {
632:     static_assert(!std::is_void<M>::value, "");

634:     PetscFunctionBegin;
635:     PetscAssertPointer(ptr, 1);
636:     *ptr = nullptr;
637:     if (n) {
638:       const auto bytes = n * sizeof(M);
639:       // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
640:       //
641:       // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
642:       // underpin its cuda-aware MPI implementation, so we cannot just async allocate
643:       // blindly...
644:       if (stream) {
645:         PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
646:       } else {
647:         PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
648:       }
649:     }
650:     PetscFunctionReturn(PETSC_SUCCESS);
651:   }

653:   template <typename M>
654:   static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
655:   {
656:     PetscFunctionBegin;
657:     PetscCall(PetscCUPMMallocAsync(ptr, n));
658:     PetscFunctionReturn(PETSC_SUCCESS);
659:   }

661:   template <typename M>
662:   static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
663:   {
664:     static_assert(!std::is_void<M>::value, "");

666:     PetscFunctionBegin;
667:     PetscAssertPointer(ptr, 1);
668:     *ptr = nullptr;
669:     if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
670:     PetscFunctionReturn(PETSC_SUCCESS);
671:   }

673:   template <typename D>
674:   static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
675:   {
676:     static_assert(!std::is_void<D>::value, "");
677:     const auto size = n * sizeof(D);

679:     PetscFunctionBegin;
680:     if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
681:     // cannot dereference (i.e. cannot call PetscAssertPointer() here)
682:     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
683:     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
684:     // do early return after nullptr check since we need to check that they are not both nullptrs
685:     if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
686:     if (kind == cupmMemcpyHostToHost) {
687:       // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
688:       // than it is to just call the vendor functions. This assumes of course that the stream
689:       // accounts for both memory regions being "idle"
690:       if (cupmStreamQuery(stream) == cupmSuccess) {
691:         PetscCall(PetscMemcpy(dest, src, size));
692:         PetscFunctionReturn(PETSC_SUCCESS);
693:       }
694:       // need to clear the potential cupmErrorNotReady generated by query above...
695:       auto cerr = cupmGetLastError();

697:       if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
698:     }
699:     if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
700:       PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
701:     } else {
702:       PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
703:     }
704:     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
705:     PetscFunctionReturn(PETSC_SUCCESS);
706:   }

708:   template <typename D>
709:   static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
710:   {
711:     PetscFunctionBegin;
712:     PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
713:     PetscFunctionReturn(PETSC_SUCCESS);
714:   }

716:   template <typename D>
717:   static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr)
718:   {
719:     static_assert(!std::is_void<D>::value, "");
720:     const auto dest_pitch_bytes = dest_pitch * sizeof(D);
721:     const auto src_pitch_bytes  = src_pitch * sizeof(D);
722:     const auto width_bytes      = width * sizeof(D);
723:     const auto size             = height * width_bytes;

725:     PetscFunctionBegin;
726:     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
727:     PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
728:     PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
729:     if (stream || (kind != cupmMemcpyDeviceToHost)) {
730:       PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream));
731:     } else {
732:       PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind));
733:     }
734:     PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
735:     PetscFunctionReturn(PETSC_SUCCESS);
736:   }

738:   template <typename D>
739:   static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind)
740:   {
741:     PetscFunctionBegin;
742:     PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind));
743:     PetscFunctionReturn(PETSC_SUCCESS);
744:   }

746:   template <typename M>
747:   static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
748:   {
749:     static_assert(!std::is_void<M>::value, "");

751:     PetscFunctionBegin;
752:     if (PetscLikely(n)) {
753:       const auto bytes = n * sizeof(M);

755:       PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
756:       if (stream || use_async) {
757:         PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
758:       } else {
759:         PetscCallCUPM(cupmMemset(ptr, value, bytes));
760:       }
761:     }
762:     PetscFunctionReturn(PETSC_SUCCESS);
763:   }

765:   template <typename M>
766:   static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
767:   {
768:     PetscFunctionBegin;
769:     PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
770:     PetscFunctionReturn(PETSC_SUCCESS);
771:   }

773:   template <typename D>
774:   static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr)
775:   {
776:     static_assert(!std::is_void<D>::value, "");
777:     const auto pitch_bytes = pitch * sizeof(D);
778:     const auto width_bytes = width * sizeof(D);
779:     const auto size        = width_bytes * height;

781:     PetscFunctionBegin;
782:     if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
783:     PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size);
784:     if (stream) {
785:       PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream));
786:     } else {
787:       PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height));
788:     }
789:     PetscFunctionReturn(PETSC_SUCCESS);
790:   }

792:   // these we can transparently wrap, no need to namespace it to Petsc
793:   template <typename M>
794:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
795:   {
796:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
797:     static_assert(!std::is_const<M>::value, "");

799:     if (ptr) {
800:       auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);

802:       ptr = nullptr;
803:       if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
804:     }
805:     return cupmSuccess;
806:   }

808:   PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }

810:   template <typename M>
811:   PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
812:   {
813:     return cupmFreeAsync(ptr);
814:   }

816:   PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }

818:   template <typename M>
819:   PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
820:   {
821:     static_assert(std::is_pointer<util::decay_t<M>>::value, "");
822:     const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
823:     ptr             = nullptr;
824:     return cerr;
825:   }

827:   PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }

829:   // specific wrapper for device launch function, as the real function is a C routine and
830:   // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
831:   // boils down to the fact that ultimately we pass a
832:   //
833:   // void *args[] = {(void*)&kernel_args...};
834:   //
835:   // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
836:   // happen to the kernel arguments so we must do it ourselves here. This function does this in
837:   // 3 stages:
838:   // 1. Enumerate the kernel arguments (cupmLaunchKernel)
839:   // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
840:   //    expected by func() using the enumeration above (deduceKernelCall)
841:   // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
842:   //    it. (interface_type::cupmLaunchKernel)
843:   template <typename F, typename... Args>
844:   PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
845:   {
846:     return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
847:   }

849:   template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
850:   static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
851:   {
852:     static_assert(block_size > 0, "");
853:     static_assert(warp_size > 0, "");
854:     // want block_size to be a multiple of the warp_size
855:     static_assert(block_size % warp_size == 0, "");
856:     const auto nthread = std::min(n, block_size);
857:     const auto nblock  = (n + block_size - 1) / block_size;

859:     PetscFunctionBegin;
860:     // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
861:     // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
862:     PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
863:     PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), (unsigned int)nblock, (unsigned int)nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
864:     PetscFunctionReturn(PETSC_SUCCESS);
865:   }

867: private:
868:   template <typename S, typename D, typename = void>
869:   struct is_static_castable : std::false_type { };

871:   template <typename S, typename D>
872:   struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };

874:   template <typename D, typename S>
875:   static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
876:   {
877:     return static_cast<D>(std::forward<S>(src));
878:   }

880:   template <typename D, typename S>
881:   static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
882:   {
883:     return const_cast<D>(std::forward<S>(src));
884:   }

886:   template <typename F, typename... Args, std::size_t... Idx>
887:   PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
888:   {
889:     // clang-format off
890:     return interface_type::cupmLaunchKernel(
891:       std::forward<F>(func),
892:       std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
893:       // can't static_cast() here since the function argument type may be cv-qualified, in
894:       // which case we would need to const_cast(). But you can only const_cast() indirect types
895:       // (pointers, references). So we need a SFINAE monster that is a static_cast() if
896:       // possible, and a const_cast() if not. We could just use a C-style cast which *would*
897:       // work here since it tries the following and uses the first one that succeeds:
898:       //
899:       // 1. const_cast()
900:       // 2. static_cast()
901:       // 3. static_cast() then const_cast()
902:       // 4. reinterpret_cast()...
903:       //
904:       // the issue however is the final reinterpret_cast(). We absolutely cannot get there
905:       // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
906:       // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
907:       cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
908:     );
909:     // clang-format on
910:   }

912:   static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept
913:   {
914:     PetscFunctionBegin;
915:     // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
916:     // (yet) or don't know the direction
917:     if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size)));
918:     else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size)));
919:     else (void)size;
920:     PetscFunctionReturn(PETSC_SUCCESS);
921:   }
922: };

924: #undef PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND

926: #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \
927:   PETSC_CUPM_IMPL_CLASS_HEADER(T); \
928:   using cupmReal_t   = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \
929:   using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \
930:   using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \
931:   using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \
932:   using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \
933:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \
934:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \
935:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \
936:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \
937:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \
938:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \
939:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \
940:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \
941:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \
942:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \
943:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \
944:   using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \
945:   using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \
946:   using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \
947:   using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \
948:   using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \
949:   using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind

951: #if PetscDefined(HAVE_CUDA)
952: extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::CUDA>;
953: #endif

955: #if PetscDefined(HAVE_HIP)
956: extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::HIP>;
957: #endif

959: } // namespace impl

961: } // namespace cupm

963: } // namespace device

965: } // namespace Petsc