Actual source code: cupminterface.hpp
1: #pragma once
3: #include <petscdevice_cupm.h>
5: #include <petsc/private/cpputil.hpp>
6: #include <petsc/private/petscadvancedmacros.h>
8: #include <petsc/private/cpp/array.hpp>
10: namespace Petsc
11: {
13: namespace device
14: {
16: namespace cupm
17: {
19: // enum describing available cupm devices, this is used as the template parameter to any
20: // class subclassing the Interface or using it as a member variable
21: enum class DeviceType : int {
22: CUDA,
23: HIP
24: };
26: // clang-format off
27: static constexpr std::array<const char *const, 5> DeviceTypes = {
28: "cuda",
29: "hip",
30: "Petsc::device::cupm::DeviceType",
31: "Petsc::device::cupm::DeviceType::",
32: nullptr
33: };
34: // clang-format on
36: namespace impl
37: {
39: #define PetscCallCUPM_(__abort_fn__, __comm__, ...) \
40: do { \
41: PetscStackUpdateLine; \
42: const cupmError_t cerr_p_ = __VA_ARGS__; \
43: __abort_fn__(cerr_p_ == cupmSuccess, __comm__, PETSC_ERR_GPU, "%s error %d (%s) : %s", cupmName(), static_cast<PetscErrorCode>(cerr_p_), cupmGetErrorName(cerr_p_), cupmGetErrorString(cerr_p_)); \
44: } while (0)
46: // A backend agnostic PetscCallCUPM() function, this will only work inside the member
47: // functions of a class inheriting from CUPM::Interface. Thanks to __VA_ARGS__ templated
48: // functions can also be wrapped inline:
49: //
50: // PetscCallCUPM(foo());
51: #define PetscCallCUPM(...) PetscCallCUPM_(PetscCheck, PETSC_COMM_SELF, __VA_ARGS__)
52: #define PetscCallCUPMAbort(comm_, ...) PetscCallCUPM_(PetscCheckAbort, comm_, __VA_ARGS__)
54: // PETSC_CUPM_ALIAS_FUNCTION() - declaration to alias a cuda/hip function
55: //
56: // input params:
57: // our_name - the name of the alias
58: // their_name - the name of the function being aliased
59: //
60: // notes:
61: // see PETSC_ALIAS_FUNCTION() for the exact nature of the expansion
62: //
63: // example usage:
64: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc) ->
65: // template
66: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
67: // {
68: // return cudaMalloc(std::forward(args)...);
69: // }
70: //
71: // PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc) ->
72: // template
73: // static constexpr auto cupmMalloc(T&&... args) *noexcept and trailing return type deduction*
74: // {
75: // return hipMalloc(std::forward(args)...);
76: // }
77: #define PETSC_CUPM_ALIAS_FUNCTION(our_name, their_name) PETSC_ALIAS_FUNCTION(static our_name, their_name)
79: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE() - declaration to alias a cuda/hip function but
80: // discard the last N arguments
81: //
82: // input params:
83: // our_name - the name of the alias
84: // their_name - the name of the function being aliased
85: // N - integer constant [0, INT_MAX) dictating how many arguments to chop off the end
86: //
87: // notes:
88: // see PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS() for the exact nature of the expansion
89: //
90: // example use:
91: // PETSC_CUPM_ALIAS_FUNCTION_GOBBLE_COMMON(cupmMallocAsync, cudaMalloc, 1) ->
92: // template
93: // static constexpr auto cupmMallocAsync(T&&... args, Tend argend) *noexcept and trailing
94: // return type deduction*
95: // {
96: // (void)argend;
97: // return cudaMalloc(std::forward(args)...);
98: // }
99: #define PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(our_name, their_name, N) PETSC_ALIAS_FUNCTION_GOBBLE_NTH_LAST_ARGS(static our_name, their_name, N)
101: // Base class that holds functions and variables that don't require CUDA or HIP to be present
102: // on the system
103: template <DeviceType T>
104: struct InterfaceBase {
105: static const DeviceType type = T;
107: PETSC_NODISCARD static constexpr const char *cupmName() noexcept
108: {
109: static_assert(util::to_underlying(DeviceType::CUDA) == 0, "");
110: static_assert(util::to_underlying(DeviceType::HIP) == 1, "");
111: return std::get<util::to_underlying(T)>(DeviceTypes);
112: }
114: PETSC_NODISCARD static constexpr const char *cupmNAME() noexcept { return T == DeviceType::CUDA ? "CUDA" : "HIP"; }
116: PETSC_NODISCARD static constexpr PetscDeviceType PETSC_DEVICE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_DEVICE_CUDA : PETSC_DEVICE_HIP; }
118: PETSC_NODISCARD static constexpr PetscMemType PETSC_MEMTYPE_CUPM() noexcept { return T == DeviceType::CUDA ? PETSC_MEMTYPE_CUDA : PETSC_MEMTYPE_HIP; }
119: };
121: // declare the base class static member variables
122: template <DeviceType T>
123: const DeviceType InterfaceBase<T>::type;
125: #define PETSC_CUPM_BASE_CLASS_HEADER(T) \
126: using ::Petsc::device::cupm::impl::InterfaceBase<T>::type; \
127: using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmName; \
128: using ::Petsc::device::cupm::impl::InterfaceBase<T>::cupmNAME; \
129: using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_DEVICE_CUPM; \
130: using ::Petsc::device::cupm::impl::InterfaceBase<T>::PETSC_MEMTYPE_CUPM
132: // A templated C++ struct that defines the entire CUPM interface. Use of templating vs
133: // preprocessor macros allows us to use both interfaces simultaneously as well as easily
134: // import them into classes.
135: template <DeviceType>
136: struct InterfaceImpl;
138: #if PetscDefined(HAVE_CUDA)
139: template <>
140: struct InterfaceImpl<DeviceType::CUDA> : InterfaceBase<DeviceType::CUDA> {
141: PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::CUDA);
143: // typedefs
144: using cupmError_t = cudaError_t;
145: using cupmEvent_t = cudaEvent_t;
146: using cupmStream_t = cudaStream_t;
147: using cupmDeviceProp_t = cudaDeviceProp;
148: using cupmMemcpyKind_t = cudaMemcpyKind;
149: using cupmDeviceAttr_t = cudaDeviceAttr;
150: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), cuComplex, cuDoubleComplex>;
151: using cupmPointerAttributes_t = cudaPointerAttributes;
152: using cupmMemoryType_t = enum cudaMemoryType;
153: using cupmDim3 = dim3;
154: using cupmHostFn_t = cudaHostFn_t;
155: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
156: using cupmMemPool_t = cudaMemPool_t;
157: using cupmMemPoolAttr = cudaMemPoolAttr;
158: #else
159: using cupmMemPool_t = void *;
160: using cupmMemPoolAttr = unsigned int;
161: #endif
163: // values
164: static const auto cupmSuccess = cudaSuccess;
165: static const auto cupmErrorNotReady = cudaErrorNotReady;
166: static const auto cupmErrorDeviceAlreadyInUse = cudaErrorDeviceAlreadyInUse;
167: static const auto cupmErrorSetOnActiveProcess = cudaErrorSetOnActiveProcess;
168: static const auto cupmErrorStubLibrary =
169: #if PETSC_PKG_CUDA_VERSION_GE(11, 1, 0)
170: cudaErrorStubLibrary;
171: #else
172: cudaErrorInsufficientDriver;
173: #endif
175: static const auto cupmErrorNoDevice = cudaErrorNoDevice;
176: static const auto cupmStreamDefault = cudaStreamDefault;
177: static const auto cupmStreamNonBlocking = cudaStreamNonBlocking;
178: static const auto cupmDeviceMapHost = cudaDeviceMapHost;
179: static const auto cupmMemcpyHostToDevice = cudaMemcpyHostToDevice;
180: static const auto cupmMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
181: static const auto cupmMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
182: static const auto cupmMemcpyHostToHost = cudaMemcpyHostToHost;
183: static const auto cupmMemcpyDefault = cudaMemcpyDefault;
184: static const auto cupmMemoryTypeHost = cudaMemoryTypeHost;
185: static const auto cupmMemoryTypeDevice = cudaMemoryTypeDevice;
186: static const auto cupmMemoryTypeManaged = cudaMemoryTypeManaged;
187: static const auto cupmEventDisableTiming = cudaEventDisableTiming;
188: static const auto cupmHostAllocDefault = cudaHostAllocDefault;
189: static const auto cupmHostAllocWriteCombined = cudaHostAllocWriteCombined;
190: static const auto cupmMemPoolAttrReleaseThreshold =
191: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
192: cudaMemPoolAttrReleaseThreshold;
193: #else
194: cupmMemPoolAttr{0};
195: #endif
196: static const auto cupmDevAttrClockRate = cudaDevAttrClockRate;
197: static const auto cupmDevAttrMemoryClockRate = cudaDevAttrMemoryClockRate;
199: // error functions
200: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, cudaGetErrorName)
201: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, cudaGetErrorString)
202: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, cudaGetLastError)
204: // device management
205: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, cudaGetDeviceCount)
206: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, cudaGetDeviceProperties)
207: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, cudaDeviceGetAttribute)
208: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, cudaGetDevice)
209: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, cudaSetDevice)
210: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, cudaGetDeviceFlags)
211: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, cudaSetDeviceFlags)
212: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, cudaPointerGetAttributes)
213: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
214: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, cudaDeviceGetMemPool)
215: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, cudaMemPoolSetAttribute)
216: #else
217: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
218: {
219: *pool = nullptr;
220: return cupmSuccess;
221: }
223: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
224: #endif
225: // CUDA has no cudaInit() to match hipInit()
226: PETSC_NODISCARD static cupmError_t cupmInit(unsigned int) noexcept { return cudaFree(nullptr); }
228: // stream management
229: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, cudaEventCreate)
230: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, cudaEventCreateWithFlags)
231: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, cudaEventDestroy)
232: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, cudaEventRecord)
233: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, cudaEventSynchronize)
234: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, cudaEventElapsedTime)
235: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, cudaEventQuery)
236: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, cudaStreamCreate)
237: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, cudaStreamCreateWithFlags)
238: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, cudaStreamGetFlags)
239: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, cudaStreamDestroy)
240: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, cudaStreamWaitEvent)
241: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, cudaStreamQuery)
242: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, cudaStreamSynchronize)
243: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, cudaDeviceSynchronize)
244: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, cudaGetSymbolAddress)
246: // memory management
247: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, cudaFree)
248: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, cudaMalloc)
249: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
250: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, cudaFreeAsync)
251: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, cudaMallocAsync)
252: #else
253: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, cudaFree, 1)
254: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, cudaMalloc, 1)
255: #endif
256: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, cudaMemcpy)
257: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, cudaMemcpyAsync)
258: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, cudaMallocHost)
259: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, cudaFreeHost)
260: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, cudaMemset)
261: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 0)
262: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, cudaMemsetAsync)
263: #else
264: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemsetAsync, cudaMemset, 1)
265: #endif
266: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, cudaMemcpy2D)
267: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, cudaMemcpy2DAsync, 1)
268: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, cudaMemset2D)
269: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, cudaMemset2DAsync, 1)
271: // launch control
272: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, cudaLaunchHostFunc)
273: template <typename FunctionT, typename... KernelArgsT>
274: PETSC_NODISCARD static cudaError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, cudaStream_t stream, KernelArgsT &&...kernelArgs) noexcept
275: {
276: static_assert(!std::is_pointer<FunctionT>::value, "kernel function must not be passed by pointer");
277: void *args[] = {(void *)std::addressof(kernelArgs)...};
279: return cudaLaunchKernel<util::remove_reference_t<FunctionT>>(std::addressof(func), std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
280: }
281: };
282: #endif // PetscDefined(HAVE_CUDA)
284: #if PetscDefined(HAVE_HIP)
285: template <>
286: struct InterfaceImpl<DeviceType::HIP> : InterfaceBase<DeviceType::HIP> {
287: PETSC_CUPM_BASE_CLASS_HEADER(DeviceType::HIP);
289: // typedefs
290: using cupmError_t = hipError_t;
291: using cupmEvent_t = hipEvent_t;
292: using cupmStream_t = hipStream_t;
293: using cupmDeviceProp_t = hipDeviceProp_t;
294: using cupmMemcpyKind_t = hipMemcpyKind;
295: using cupmDeviceAttr_t = hipDeviceAttribute_t;
296: using cupmComplex_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), hipComplex, hipDoubleComplex>;
297: using cupmPointerAttributes_t = hipPointerAttribute_t;
298: using cupmMemoryType_t = enum hipMemoryType;
299: using cupmDim3 = dim3;
300: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
301: using cupmHostFn_t = hipHostFn_t;
302: using cupmMemPool_t = hipMemPool_t;
303: using cupmMemPoolAttr = hipMemPoolAttr;
304: #else
305: using cupmHostFn_t = void (*)(void *);
306: using cupmMemPool_t = void *;
307: using cupmMemPoolAttr = unsigned int;
308: #endif
310: // values
311: static const auto cupmSuccess = hipSuccess;
312: static const auto cupmErrorNotReady = hipErrorNotReady;
313: // see https://github.com/ROCm-Developer-Tools/HIP/blob/develop/bin/hipify-perl
314: static const auto cupmErrorDeviceAlreadyInUse = hipErrorContextAlreadyInUse;
315: static const auto cupmErrorSetOnActiveProcess = hipErrorSetOnActiveProcess;
316: // as of HIP v4.2 cudaErrorStubLibrary has no HIP equivalent
317: static const auto cupmErrorStubLibrary = hipErrorInsufficientDriver;
318: static const auto cupmErrorNoDevice = hipErrorNoDevice;
319: static const auto cupmStreamDefault = hipStreamDefault;
320: static const auto cupmStreamNonBlocking = hipStreamNonBlocking;
321: static const auto cupmDeviceMapHost = hipDeviceMapHost;
322: static const auto cupmMemcpyHostToDevice = hipMemcpyHostToDevice;
323: static const auto cupmMemcpyDeviceToHost = hipMemcpyDeviceToHost;
324: static const auto cupmMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
325: static const auto cupmMemcpyHostToHost = hipMemcpyHostToHost;
326: static const auto cupmMemcpyDefault = hipMemcpyDefault;
327: static const auto cupmMemoryTypeHost = hipMemoryTypeHost;
328: static const auto cupmMemoryTypeDevice = hipMemoryTypeDevice;
329: // see
330: // https://github.com/ROCm-Developer-Tools/HIP/blob/develop/include/hip/hip_runtime_api.h#L156
331: static const auto cupmMemoryTypeManaged = hipMemoryTypeUnified;
332: static const auto cupmEventDisableTiming = hipEventDisableTiming;
333: static const auto cupmHostAllocDefault = hipHostMallocDefault;
334: static const auto cupmHostAllocWriteCombined = hipHostMallocWriteCombined;
335: static const auto cupmMemPoolAttrReleaseThreshold =
336: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
337: hipMemPoolAttrReleaseThreshold;
338: #else
339: cupmMemPoolAttr{0};
340: #endif
341: static const auto cupmDevAttrClockRate = hipDeviceAttributeClockRate;
342: static const auto cupmDevAttrMemoryClockRate = hipDeviceAttributeMemoryClockRate;
344: // error functions
345: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorName, hipGetErrorName)
346: PETSC_CUPM_ALIAS_FUNCTION(cupmGetErrorString, hipGetErrorString)
347: PETSC_CUPM_ALIAS_FUNCTION(cupmGetLastError, hipGetLastError)
349: // device management
350: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceCount, hipGetDeviceCount)
351: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceProperties, hipGetDeviceProperties)
352: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetAttribute, hipDeviceGetAttribute)
353: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDevice, hipGetDevice)
354: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDevice, hipSetDevice)
355: PETSC_CUPM_ALIAS_FUNCTION(cupmGetDeviceFlags, hipGetDeviceFlags)
356: PETSC_CUPM_ALIAS_FUNCTION(cupmSetDeviceFlags, hipSetDeviceFlags)
357: PETSC_CUPM_ALIAS_FUNCTION(cupmPointerGetAttributes, hipPointerGetAttributes)
358: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
359: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceGetMemPool, hipDeviceGetMemPool)
360: PETSC_CUPM_ALIAS_FUNCTION(cupmMemPoolSetAttribute, hipMemPoolSetAttribute)
361: #else
362: PETSC_NODISCARD static cupmError_t cupmDeviceGetMemPool(cupmMemPool_t *pool, int) noexcept
363: {
364: *pool = nullptr;
365: return cupmSuccess;
366: }
368: PETSC_NODISCARD static cupmError_t cupmMemPoolSetAttribute(cupmMemPool_t, cupmMemPoolAttr, void *) noexcept { return cupmSuccess; }
369: #endif
370: PETSC_CUPM_ALIAS_FUNCTION(cupmInit, hipInit)
372: // stream management
373: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreate, hipEventCreate)
374: PETSC_CUPM_ALIAS_FUNCTION(cupmEventCreateWithFlags, hipEventCreateWithFlags)
375: PETSC_CUPM_ALIAS_FUNCTION(cupmEventDestroy, hipEventDestroy)
376: PETSC_CUPM_ALIAS_FUNCTION(cupmEventRecord, hipEventRecord)
377: PETSC_CUPM_ALIAS_FUNCTION(cupmEventSynchronize, hipEventSynchronize)
378: PETSC_CUPM_ALIAS_FUNCTION(cupmEventElapsedTime, hipEventElapsedTime)
379: PETSC_CUPM_ALIAS_FUNCTION(cupmEventQuery, hipEventQuery)
380: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreate, hipStreamCreate)
381: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamCreateWithFlags, hipStreamCreateWithFlags)
382: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamGetFlags, hipStreamGetFlags)
383: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamDestroy, hipStreamDestroy)
384: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamWaitEvent, hipStreamWaitEvent)
385: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamQuery, hipStreamQuery)
386: PETSC_CUPM_ALIAS_FUNCTION(cupmStreamSynchronize, hipStreamSynchronize)
387: PETSC_CUPM_ALIAS_FUNCTION(cupmDeviceSynchronize, hipDeviceSynchronize)
388: PETSC_CUPM_ALIAS_FUNCTION(cupmGetSymbolAddress, hipGetSymbolAddress)
390: // memory management
391: PETSC_CUPM_ALIAS_FUNCTION(cupmFree, hipFree)
392: PETSC_CUPM_ALIAS_FUNCTION(cupmMalloc, hipMalloc)
393: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
394: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocAsync, hipMallocAsync)
395: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeAsync, hipFreeAsync)
396: #else
397: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMallocAsync, hipMalloc, 1)
398: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmFreeAsync, hipFree, 1)
399: #endif
400: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy, hipMemcpy)
401: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpyAsync, hipMemcpyAsync)
402: // hipMallocHost is deprecated
403: PETSC_CUPM_ALIAS_FUNCTION(cupmMallocHost, hipHostMalloc)
404: // hipFreeHost is deprecated
405: PETSC_CUPM_ALIAS_FUNCTION(cupmFreeHost, hipHostFree)
406: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset, hipMemset)
407: PETSC_CUPM_ALIAS_FUNCTION(cupmMemsetAsync, hipMemsetAsync)
408: PETSC_CUPM_ALIAS_FUNCTION(cupmMemcpy2D, hipMemcpy2D)
409: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemcpy2DAsync, hipMemcpy2DAsync, 1)
410: PETSC_CUPM_ALIAS_FUNCTION(cupmMemset2D, hipMemset2D)
411: PETSC_CUPM_ALIAS_FUNCTION_GOBBLE(cupmMemset2DAsync, hipMemset2DAsync, 1)
413: // launch control
414: // HIP appears to only have hipLaunchHostFunc from 5.2.0 onwards
415: // https://github.com/ROCm-Developer-Tools/HIPIFY/blob/master/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md#7-execution-control=
416: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
417: PETSC_CUPM_ALIAS_FUNCTION(cupmLaunchHostFunc, hipLaunchHostFunc)
418: #else
419: PETSC_NODISCARD static hipError_t cupmLaunchHostFunc(hipStream_t stream, cupmHostFn_t fn, void *ctx) noexcept
420: {
421: // the only correct way to spoof this function is to do it synchronously...
422: auto herr = hipStreamSynchronize(stream);
423: if (PetscUnlikely(herr != hipSuccess)) return herr;
424: fn(ctx);
425: return herr;
426: }
427: #endif
429: template <typename FunctionT, typename... KernelArgsT>
430: PETSC_NODISCARD static hipError_t cupmLaunchKernel(FunctionT &&func, dim3 gridDim, dim3 blockDim, std::size_t sharedMem, hipStream_t stream, KernelArgsT &&...kernelArgs) noexcept
431: {
432: void *args[] = {(void *)std::addressof(kernelArgs)...};
434: return hipLaunchKernel((void *)func, std::move(gridDim), std::move(blockDim), args, sharedMem, std::move(stream));
435: }
436: };
437: #endif // PetscDefined(HAVE_HIP)
439: // shorthand for bringing all of the typedefs from the base Interface class into your own,
440: // it's annoying that c++ doesn't have a way to do this automatically
441: #define PETSC_CUPM_IMPL_CLASS_HEADER(T) \
442: PETSC_CUPM_BASE_CLASS_HEADER(T); \
443: /* types */ \
444: using cupmError_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmError_t; \
445: using cupmEvent_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEvent_t; \
446: using cupmStream_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStream_t; \
447: using cupmDeviceProp_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceProp_t; \
448: using cupmMemcpyKind_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyKind_t; \
449: using cupmDeviceAttr_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceAttr_t; \
450: using cupmComplex_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmComplex_t; \
451: using cupmPointerAttributes_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerAttributes_t; \
452: using cupmMemoryType_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryType_t; \
453: using cupmDim3 = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDim3; \
454: using cupmHostFn_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostFn_t; \
455: using cupmMemPool_t = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPool_t; \
456: using cupmMemPoolAttr = typename ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttr; \
457: /* variables */ \
458: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSuccess; \
459: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNotReady; \
460: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorDeviceAlreadyInUse; \
461: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorSetOnActiveProcess; \
462: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorStubLibrary; \
463: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmErrorNoDevice; \
464: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDefault; \
465: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamNonBlocking; \
466: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceMapHost; \
467: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToDevice; \
468: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToHost; \
469: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDeviceToDevice; \
470: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyHostToHost; \
471: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyDefault; \
472: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeHost; \
473: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeDevice; \
474: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemoryTypeManaged; \
475: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDisableTiming; \
476: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocDefault; \
477: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmHostAllocWriteCombined; \
478: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolAttrReleaseThreshold; \
479: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrClockRate; \
480: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDevAttrMemoryClockRate; \
481: /* functions */ \
482: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorName; \
483: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetErrorString; \
484: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetLastError; \
485: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceCount; \
486: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceProperties; \
487: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetAttribute; \
488: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDevice; \
489: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDevice; \
490: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetDeviceFlags; \
491: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmSetDeviceFlags; \
492: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmPointerGetAttributes; \
493: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceGetMemPool; \
494: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemPoolSetAttribute; \
495: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmInit; \
496: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreate; \
497: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventCreateWithFlags; \
498: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventDestroy; \
499: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventRecord; \
500: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventSynchronize; \
501: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventElapsedTime; \
502: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmEventQuery; \
503: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreate; \
504: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamCreateWithFlags; \
505: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamGetFlags; \
506: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamDestroy; \
507: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamWaitEvent; \
508: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamQuery; \
509: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmStreamSynchronize; \
510: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmDeviceSynchronize; \
511: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmGetSymbolAddress; \
512: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMalloc; \
513: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocAsync; \
514: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy; \
515: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpyAsync; \
516: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMallocHost; \
517: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset; \
518: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemsetAsync; \
519: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2D; \
520: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemcpy2DAsync; \
521: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2D; \
522: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmMemset2DAsync; \
523: using ::Petsc::device::cupm::impl::InterfaceImpl<T>::cupmLaunchHostFunc
525: #if PetscHasAttribute(always_inline)
526: // https://gcc.gnu.org/bugzilla//show_bug.cgi?id=109464
527: #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND __attribute__((always_inline))
528: #else
529: #define PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND
530: #endif
532: // The actual interface class
533: template <DeviceType T>
534: struct Interface : InterfaceImpl<T> {
535: private:
536: using interface_type = InterfaceImpl<T>;
538: public:
539: PETSC_CUPM_IMPL_CLASS_HEADER(T);
541: using cupmReal_t = util::conditional_t<PetscDefined(USE_REAL_SINGLE), float, double>;
542: using cupmScalar_t = util::conditional_t<PetscDefined(USE_COMPLEX), cupmComplex_t, cupmReal_t>;
544: PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t cupmScalarCast(PetscScalar s) noexcept
545: {
546: #if PetscDefined(USE_COMPLEX)
547: return cupmComplex_t{PetscRealPart(s), PetscImaginaryPart(s)};
548: #else
549: return static_cast<cupmScalar_t>(s);
550: #endif
551: }
553: PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmScalar_t *cupmScalarPtrCast(const PetscScalar *s) noexcept { return reinterpret_cast<const cupmScalar_t *>(s); }
555: PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmScalar_t *cupmScalarPtrCast(PetscScalar *s) noexcept { return reinterpret_cast<cupmScalar_t *>(s); }
557: PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr const cupmReal_t *cupmRealPtrCast(const PetscReal *s) noexcept { return reinterpret_cast<const cupmReal_t *>(s); }
559: PETSC_NODISCARD PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND static constexpr cupmReal_t *cupmRealPtrCast(PetscReal *s) noexcept { return reinterpret_cast<cupmReal_t *>(s); }
561: #if !defined(PETSC_PKG_CUDA_VERSION_GE)
562: #define PETSC_PKG_CUDA_VERSION_GE(...) 0
563: #define CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE
564: #endif
566: #if !defined(PETSC_PKG_HIP_VERSION_LT)
567: #define PETSC_PKG_HIP_VERSION_LT(...) 0
568: #define CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT
569: #endif
571: static PetscErrorCode PetscCUPMGetMemType(const void *data, PetscMemType *type, PetscBool *registered = nullptr, PetscBool *managed = nullptr) noexcept
572: {
573: cupmPointerAttributes_t attr;
574: cupmError_t cerr;
576: PetscFunctionBegin;
577: if (type) PetscAssertPointer(type, 2);
578: if (registered) {
579: PetscAssertPointer(registered, 3);
580: *registered = PETSC_FALSE;
581: }
582: if (managed) {
583: PetscAssertPointer(managed, 4);
584: *managed = PETSC_FALSE;
585: }
586: // Do not check error, instead reset it via GetLastError() since before CUDA 11.0, passing
587: // a host pointer returns cudaErrorInvalidValue
588: cerr = cupmPointerGetAttributes(&attr, data);
589: cerr = cupmGetLastError();
590: // HIP seems to always have used memoryType though
591: #if (defined(CUDART_VERSION) && (CUDART_VERSION < 10000)) || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
592: const auto mtype = attr.memoryType;
593: if (managed) *managed = static_cast<PetscBool>((cerr == cupmSuccess) && attr.isManaged);
594: #else
595: if (PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) && (T == DeviceType::CUDA)) PetscCallCUPM(cerr);
596: const auto mtype = attr.type;
597: if (managed) *managed = static_cast<PetscBool>(mtype == cupmMemoryTypeManaged);
598: #endif // CUDART_VERSION && CUDART_VERSION < 10000 || (defined(__HIP_PLATFORM_HCC__) && PETSC_PKG_HIP_VERSION_LT(5, 5, 0))
599: if (type) *type = ((cerr == cupmSuccess) && (mtype == cupmMemoryTypeDevice)) ? PETSC_MEMTYPE_CUPM() : PETSC_MEMTYPE_HOST;
600: if (registered && (cerr == cupmSuccess) && (mtype == cupmMemoryTypeHost)) *registered = PETSC_TRUE;
601: PetscFunctionReturn(PETSC_SUCCESS);
602: }
603: #if defined(CUPM_DEFINED_PETSC_PKG_CUDA_VERSION_GE)
604: #undef PETSC_PKG_CUDA_VERSION_GE
605: #endif
606: #if defined(CUPM_DEFINED_PETSC_PKG_HIP_VERSION_LT)
607: #undef PETSC_PKG_HIP_VERSION_LT
608: #endif
610: PETSC_NODISCARD static PETSC_CONSTEXPR_14 cupmMemcpyKind_t PetscDeviceCopyModeToCUPMMemcpyKind(PetscDeviceCopyMode mode) noexcept
611: {
612: switch (mode) {
613: case PETSC_DEVICE_COPY_HTOH:
614: return cupmMemcpyHostToHost;
615: case PETSC_DEVICE_COPY_HTOD:
616: return cupmMemcpyHostToDevice;
617: case PETSC_DEVICE_COPY_DTOD:
618: return cupmMemcpyDeviceToDevice;
619: case PETSC_DEVICE_COPY_DTOH:
620: return cupmMemcpyDeviceToHost;
621: case PETSC_DEVICE_COPY_AUTO:
622: return cupmMemcpyDefault;
623: }
624: PetscUnreachable();
625: return cupmMemcpyDefault;
626: }
628: // these change what the arguments mean, so need to namespace these
629: template <typename M>
630: static PetscErrorCode PetscCUPMMallocAsync(M **ptr, std::size_t n, cupmStream_t stream = nullptr) noexcept
631: {
632: static_assert(!std::is_void<M>::value, "");
634: PetscFunctionBegin;
635: PetscAssertPointer(ptr, 1);
636: *ptr = nullptr;
637: if (n) {
638: const auto bytes = n * sizeof(M);
639: // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
640: //
641: // TLD;DR: cudaMallocAsync() does not work with NVIDIA GPUDirect which OPENMPI uses to
642: // underpin its cuda-aware MPI implementation, so we cannot just async allocate
643: // blindly...
644: if (stream) {
645: PetscCallCUPM(cupmMallocAsync(reinterpret_cast<void **>(ptr), bytes, stream));
646: } else {
647: PetscCallCUPM(cupmMalloc(reinterpret_cast<void **>(ptr), bytes));
648: }
649: }
650: PetscFunctionReturn(PETSC_SUCCESS);
651: }
653: template <typename M>
654: static PetscErrorCode PetscCUPMMalloc(M **ptr, std::size_t n) noexcept
655: {
656: PetscFunctionBegin;
657: PetscCall(PetscCUPMMallocAsync(ptr, n));
658: PetscFunctionReturn(PETSC_SUCCESS);
659: }
661: template <typename M>
662: static PetscErrorCode PetscCUPMMallocHost(M **ptr, std::size_t n, unsigned int flags = cupmHostAllocDefault) noexcept
663: {
664: static_assert(!std::is_void<M>::value, "");
666: PetscFunctionBegin;
667: PetscAssertPointer(ptr, 1);
668: *ptr = nullptr;
669: if (n) PetscCallCUPM(cupmMallocHost(reinterpret_cast<void **>(ptr), n * sizeof(M), flags));
670: PetscFunctionReturn(PETSC_SUCCESS);
671: }
673: template <typename D>
674: static PetscErrorCode PetscCUPMMemcpyAsync(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr, bool use_async = false) noexcept
675: {
676: static_assert(!std::is_void<D>::value, "");
677: const auto size = n * sizeof(D);
679: PetscFunctionBegin;
680: if (PetscUnlikely(!n)) PetscFunctionReturn(PETSC_SUCCESS);
681: // cannot dereference (i.e. cannot call PetscAssertPointer() here)
682: PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
683: PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
684: // do early return after nullptr check since we need to check that they are not both nullptrs
685: if (PetscUnlikely(dest == src)) PetscFunctionReturn(PETSC_SUCCESS);
686: if (kind == cupmMemcpyHostToHost) {
687: // If we are HTOH it is cheaper to check if the stream is idle and do a basic mempcy()
688: // than it is to just call the vendor functions. This assumes of course that the stream
689: // accounts for both memory regions being "idle"
690: if (cupmStreamQuery(stream) == cupmSuccess) {
691: PetscCall(PetscMemcpy(dest, src, size));
692: PetscFunctionReturn(PETSC_SUCCESS);
693: }
694: // need to clear the potential cupmErrorNotReady generated by query above...
695: auto cerr = cupmGetLastError();
697: if (PetscUnlikely(cerr != cupmErrorNotReady)) PetscCallCUPM(cerr);
698: }
699: if (use_async || stream || (kind != cupmMemcpyDeviceToHost)) {
700: PetscCallCUPM(cupmMemcpyAsync(dest, src, size, kind, stream));
701: } else {
702: PetscCallCUPM(cupmMemcpy(dest, src, size, kind));
703: }
704: PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
705: PetscFunctionReturn(PETSC_SUCCESS);
706: }
708: template <typename D>
709: static PetscErrorCode PetscCUPMMemcpy(D *dest, const util::type_identity_t<D> *src, std::size_t n, cupmMemcpyKind_t kind) noexcept
710: {
711: PetscFunctionBegin;
712: PetscCall(PetscCUPMMemcpyAsync(dest, src, n, kind));
713: PetscFunctionReturn(PETSC_SUCCESS);
714: }
716: template <typename D>
717: static PetscErrorCode PetscCUPMMemcpy2DAsync(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind, cupmStream_t stream = nullptr)
718: {
719: static_assert(!std::is_void<D>::value, "");
720: const auto dest_pitch_bytes = dest_pitch * sizeof(D);
721: const auto src_pitch_bytes = src_pitch * sizeof(D);
722: const auto width_bytes = width * sizeof(D);
723: const auto size = height * width_bytes;
725: PetscFunctionBegin;
726: if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
727: PetscCheck(dest, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy to a NULL pointer");
728: PetscCheck(src, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to copy from a NULL pointer");
729: if (stream || (kind != cupmMemcpyDeviceToHost)) {
730: PetscCallCUPM(cupmMemcpy2DAsync(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind, stream));
731: } else {
732: PetscCallCUPM(cupmMemcpy2D(dest, dest_pitch_bytes, src, src_pitch_bytes, width_bytes, height, kind));
733: }
734: PetscCall(PetscLogCUPMMemcpyTransfer(kind, size));
735: PetscFunctionReturn(PETSC_SUCCESS);
736: }
738: template <typename D>
739: static PetscErrorCode PetscCUPMMemcpy2D(D *dest, std::size_t dest_pitch, const util::type_identity_t<D> *src, std::size_t src_pitch, std::size_t width, std::size_t height, cupmMemcpyKind_t kind)
740: {
741: PetscFunctionBegin;
742: PetscCall(PetscCUPMMemcpy2DAsync(dest, dest_pitch, src, src_pitch, width, height, kind));
743: PetscFunctionReturn(PETSC_SUCCESS);
744: }
746: template <typename M>
747: static PetscErrorCode PetscCUPMMemsetAsync(M *ptr, int value, std::size_t n, cupmStream_t stream = nullptr, bool use_async = false) noexcept
748: {
749: static_assert(!std::is_void<M>::value, "");
751: PetscFunctionBegin;
752: if (PetscLikely(n)) {
753: const auto bytes = n * sizeof(M);
755: PetscCheck(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", n);
756: if (stream || use_async) {
757: PetscCallCUPM(cupmMemsetAsync(ptr, value, bytes, stream));
758: } else {
759: PetscCallCUPM(cupmMemset(ptr, value, bytes));
760: }
761: }
762: PetscFunctionReturn(PETSC_SUCCESS);
763: }
765: template <typename M>
766: static PetscErrorCode PetscCUPMMemset(M *ptr, int value, std::size_t n) noexcept
767: {
768: PetscFunctionBegin;
769: PetscCall(PetscCUPMMemsetAsync(ptr, value, n));
770: PetscFunctionReturn(PETSC_SUCCESS);
771: }
773: template <typename D>
774: static PetscErrorCode PetscCUPMMemset2DAsync(D *ptr, std::size_t pitch, int value, std::size_t width, std::size_t height, cupmStream_t stream = nullptr)
775: {
776: static_assert(!std::is_void<D>::value, "");
777: const auto pitch_bytes = pitch * sizeof(D);
778: const auto width_bytes = width * sizeof(D);
779: const auto size = width_bytes * height;
781: PetscFunctionBegin;
782: if (PetscUnlikely(!size)) PetscFunctionReturn(PETSC_SUCCESS);
783: PetscAssert(ptr, PETSC_COMM_SELF, PETSC_ERR_POINTER, "Trying to memset a NULL pointer with size %zu != 0", size);
784: if (stream) {
785: PetscCallCUPM(cupmMemset2DAsync(ptr, pitch_bytes, value, width_bytes, height, stream));
786: } else {
787: PetscCallCUPM(cupmMemset2D(ptr, pitch_bytes, value, width_bytes, height));
788: }
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
792: // these we can transparently wrap, no need to namespace it to Petsc
793: template <typename M>
794: PETSC_NODISCARD static cupmError_t cupmFreeAsync(M &ptr, cupmStream_t stream = nullptr) noexcept
795: {
796: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
797: static_assert(!std::is_const<M>::value, "");
799: if (ptr) {
800: auto cerr = interface_type::cupmFreeAsync(std::forward<M>(ptr), stream);
802: ptr = nullptr;
803: if (PetscUnlikely(cerr != cupmSuccess)) return cerr;
804: }
805: return cupmSuccess;
806: }
808: PETSC_NODISCARD static cupmError_t cupmFreeAsync(std::nullptr_t ptr, cupmStream_t stream = nullptr) { return interface_type::cupmFreeAsync(ptr, stream); }
810: template <typename M>
811: PETSC_NODISCARD static cupmError_t cupmFree(M &ptr) noexcept
812: {
813: return cupmFreeAsync(ptr);
814: }
816: PETSC_NODISCARD static cupmError_t cupmFree(std::nullptr_t ptr) { return cupmFreeAsync(ptr); }
818: template <typename M>
819: PETSC_NODISCARD static cupmError_t cupmFreeHost(M &ptr) noexcept
820: {
821: static_assert(std::is_pointer<util::decay_t<M>>::value, "");
822: const auto cerr = interface_type::cupmFreeHost(std::forward<M>(ptr));
823: ptr = nullptr;
824: return cerr;
825: }
827: PETSC_NODISCARD static cupmError_t cupmFreeHost(std::nullptr_t ptr) { return interface_type::cupmFreeHost(ptr); }
829: // specific wrapper for device launch function, as the real function is a C routine and
830: // doesn't have variable arguments. The actual mechanics of this are a bit complicated but
831: // boils down to the fact that ultimately we pass a
832: //
833: // void *args[] = {(void*)&kernel_args...};
834: //
835: // to the kernel launcher. Since we pass void* this means implicit conversion does **not**
836: // happen to the kernel arguments so we must do it ourselves here. This function does this in
837: // 3 stages:
838: // 1. Enumerate the kernel arguments (cupmLaunchKernel)
839: // 2. Deduce the signature of func() and static_cast the kernel arguments to the type
840: // expected by func() using the enumeration above (deduceKernelCall)
841: // 3. Form the void* array with the converted arguments and call cuda/hipLaunchKernel with
842: // it. (interface_type::cupmLaunchKernel)
843: template <typename F, typename... Args>
844: PETSC_NODISCARD static cupmError_t cupmLaunchKernel(F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
845: {
846: return deduceKernelCall(util::index_sequence_for<Args...>{}, std::forward<F>(func), std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream), std::forward<Args>(kernelArgs)...);
847: }
849: template <std::size_t block_size = 256, std::size_t warp_size = 32, typename F, typename... Args>
850: static PetscErrorCode PetscCUPMLaunchKernel1D(std::size_t n, std::size_t sharedMem, cupmStream_t stream, F &&func, Args &&...kernelArgs) noexcept
851: {
852: static_assert(block_size > 0, "");
853: static_assert(warp_size > 0, "");
854: // want block_size to be a multiple of the warp_size
855: static_assert(block_size % warp_size == 0, "");
856: const auto nthread = std::min(n, block_size);
857: const auto nblock = (n + block_size - 1) / block_size;
859: PetscFunctionBegin;
860: // if n = 0 then nthread = 0, which is not allowed. rather than letting the user try to
861: // decipher cryptic 'cuda/hipErrorLaunchFailure' we explicitly check for zero here
862: PetscAssert(nthread, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Trying to launch kernel with grid/block size 0");
863: PetscCallCUPM(cupmLaunchKernel(std::forward<F>(func), (unsigned int)nblock, (unsigned int)nthread, sharedMem, stream, std::forward<Args>(kernelArgs)...));
864: PetscFunctionReturn(PETSC_SUCCESS);
865: }
867: private:
868: template <typename S, typename D, typename = void>
869: struct is_static_castable : std::false_type { };
871: template <typename S, typename D>
872: struct is_static_castable<S, D, util::void_t<decltype(static_cast<D>(std::declval<S>()))>> : std::true_type { };
874: template <typename D, typename S>
875: static constexpr util::enable_if_t<is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
876: {
877: return static_cast<D>(std::forward<S>(src));
878: }
880: template <typename D, typename S>
881: static constexpr util::enable_if_t<!is_static_castable<S, D>::value, D> cast_to(S &&src) noexcept
882: {
883: return const_cast<D>(std::forward<S>(src));
884: }
886: template <typename F, typename... Args, std::size_t... Idx>
887: PETSC_NODISCARD static cupmError_t deduceKernelCall(util::index_sequence<Idx...>, F &&func, cupmDim3 gridDim, cupmDim3 blockDim, std::size_t sharedMem, cupmStream_t stream, Args &&...kernelArgs) noexcept
888: {
889: // clang-format off
890: return interface_type::cupmLaunchKernel(
891: std::forward<F>(func),
892: std::move(gridDim), std::move(blockDim), std::move(sharedMem), std::move(stream),
893: // can't static_cast() here since the function argument type may be cv-qualified, in
894: // which case we would need to const_cast(). But you can only const_cast() indirect types
895: // (pointers, references). So we need a SFINAE monster that is a static_cast() if
896: // possible, and a const_cast() if not. We could just use a C-style cast which *would*
897: // work here since it tries the following and uses the first one that succeeds:
898: //
899: // 1. const_cast()
900: // 2. static_cast()
901: // 3. static_cast() then const_cast()
902: // 4. reinterpret_cast()...
903: //
904: // the issue however is the final reinterpret_cast(). We absolutely cannot get there
905: // because doing so would silently hide a ton of bugs, for example casting a PetscScalar
906: // * to double * in complex builds, a PetscInt * to int * in 64idx builds, etc.
907: cast_to<typename util::func_traits<F>::template arg<Idx>::type>(std::forward<Args>(kernelArgs))...
908: );
909: // clang-format on
910: }
912: static PetscErrorCode PetscLogCUPMMemcpyTransfer(cupmMemcpyKind_t kind, std::size_t size) noexcept
913: {
914: PetscFunctionBegin;
915: // only the explicit HTOD or DTOH are handled, since we either don't log the other cases
916: // (yet) or don't know the direction
917: if (kind == cupmMemcpyDeviceToHost) PetscCall(PetscLogGpuToCpu(static_cast<PetscLogDouble>(size)));
918: else if (kind == cupmMemcpyHostToDevice) PetscCall(PetscLogCpuToGpu(static_cast<PetscLogDouble>(size)));
919: else (void)size;
920: PetscFunctionReturn(PETSC_SUCCESS);
921: }
922: };
924: #undef PETSC_GCC_LINKER_UNDEFINED_REFERENCE_BUG_WORKAROUND
926: #define PETSC_CUPM_INHERIT_INTERFACE_TYPEDEFS_USING(T) \
927: PETSC_CUPM_IMPL_CLASS_HEADER(T); \
928: using cupmReal_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmReal_t; \
929: using cupmScalar_t = typename ::Petsc::device::cupm::impl::Interface<T>::cupmScalar_t; \
930: using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarCast; \
931: using ::Petsc::device::cupm::impl::Interface<T>::cupmScalarPtrCast; \
932: using ::Petsc::device::cupm::impl::Interface<T>::cupmRealPtrCast; \
933: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMGetMemType; \
934: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset; \
935: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemsetAsync; \
936: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMalloc; \
937: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocAsync; \
938: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMallocHost; \
939: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy; \
940: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpyAsync; \
941: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2D; \
942: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemcpy2DAsync; \
943: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMMemset2DAsync; \
944: using ::Petsc::device::cupm::impl::Interface<T>::cupmFree; \
945: using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeAsync; \
946: using ::Petsc::device::cupm::impl::Interface<T>::cupmFreeHost; \
947: using ::Petsc::device::cupm::impl::Interface<T>::cupmLaunchKernel; \
948: using ::Petsc::device::cupm::impl::Interface<T>::PetscCUPMLaunchKernel1D; \
949: using ::Petsc::device::cupm::impl::Interface<T>::PetscDeviceCopyModeToCUPMMemcpyKind
951: #if PetscDefined(HAVE_CUDA)
952: extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::CUDA>;
953: #endif
955: #if PetscDefined(HAVE_HIP)
956: extern template struct PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL Interface<DeviceType::HIP>;
957: #endif
959: } // namespace impl
961: } // namespace cupm
963: } // namespace device
965: } // namespace Petsc