Actual source code: petscdevice_hip.h

  1: #pragma once

  3: #include <petscdevice.h>
  4: #include <petscpkg_version.h>

  6: /* MANSEC = Sys */
  7: /* SUBMANSEC = Device */

  9: #if defined(__HCC__) || (defined(__clang__) && defined(__HIP__))
 10:   #define PETSC_USING_HCC 1
 11: #endif

 13: #if PetscDefined(HAVE_HIP)
 14:   #include <hip/hip_runtime.h>

 16:   #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)

 18:     // cupmScalarPtrCast() returns hip{Float,Double}Complex while hipBLAS uses hipBlas{Float,Double}Complex, causing many VecCUPM errors like
 19:     // error: no matching function for call to 'cupmBlasXdot'.
 20:     // Before rocm-6.0, one can define ROCM_MATHLIBS_API_USE_HIP_COMPLEX to force rocm to 'typedef hipDoubleComplex hipBlasDoubleComplex' for example.
 21:     // Since then, ROCM_MATHLIBS_API_USE_HIP_COMPLEX is deprecated, and one can define HIPBLAS_V2 to use version 2 of hipBLAS that directly use hipDoubleComplex etc.
 22:     // Per AMD, HIPBLAS_V2 will be removed in the future so that hipBLAS only provides updated APIs (but not yet in 6.2.2 as of Sep. 27, 2024).
 23:     //
 24:     // see https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.0.0/functions.html#complex-datatypes
 25:     // and https://rocm.docs.amd.com/projects/hipBLAS/en/docs-6.2.2/functions.html#hipblas-v2-and-deprecations
 26:     #if PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
 27:       #define HIPBLAS_V2
 28:     #else
 29:       #define ROCM_MATHLIBS_API_USE_HIP_COMPLEX
 30:     #endif
 31:     #include <hipblas/hipblas.h>
 32:     #include <hipsparse/hipsparse.h>
 33:   #else
 34:     #include <hipblas.h>
 35:     #include <hipsparse.h>
 36:   #endif

 38:   #if PETSC_PKG_HIP_VERSION_LT(5, 4, 0)
 39:     #define HIPSPARSE_ORDER_COL HIPSPARSE_ORDER_COLUMN
 40:   #endif

 42:   #if defined(__HIP_PLATFORM_NVCC__)
 43:     #include <cusolverDn.h>
 44:   #else // __HIP_PLATFORM_HCC__
 45:     #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
 46:       #include <hipsolver/hipsolver.h>
 47:     #else
 48:       #include <hipsolver.h>
 49:     #endif
 50:   #endif                       // __HIP_PLATFORM_NVCC__
 51:   #include <hip/hip_complex.h> // for hipComplex, hipDoubleComplex

 53:   /*MC
 54:     WaitForHIP - Block the calling host thread until all previously queued work on the current HIP device has completed

 56:     Synopsis:
 57: #include <petscdevice_hip.h>
 58:     hipError_t WaitForHIP(void)

 60:     Not Collective; No Fortran Support

 62:     Level: developer

 64:     Note:
 65:     Thin convenience wrapper around `hipDeviceSynchronize()`. Marked for removal in favour of
 66:     explicit `PetscDeviceContext` synchronization.

 68: .seealso: `PetscDeviceContext`, `PetscDeviceContextSynchronize()`, `WaitForCUDA()`
 69: M*/
 70:   // REMOVE ME
 71:   #define WaitForHIP() hipDeviceSynchronize()

 73: /* hipBLAS, hipSPARSE and hipSolver does not have hip*GetErrorName(). We create one on our own. */
 74: PETSC_EXTERN const char *PetscHIPBLASGetErrorName(hipblasStatus_t);     /* PETSC_EXTERN since it is exposed by the CHKERRHIPBLAS macro */
 75: PETSC_EXTERN const char *PetscHIPSPARSEGetErrorName(hipsparseStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRHIPSPARSE macro */
 76: PETSC_EXTERN const char *PetscHIPSolverGetErrorName(hipsolverStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRHIPSOLVER macro */

 78:   #define PetscCallHIP(...) \
 79:     do { \
 80:       const hipError_t _p_hip_err__ = __VA_ARGS__; \
 81:       if (PetscUnlikely(_p_hip_err__ != hipSuccess)) { \
 82:         const char *name  = hipGetErrorName(_p_hip_err__); \
 83:         const char *descr = hipGetErrorString(_p_hip_err__); \
 84:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "hip error %d (%s) : %s", (PetscErrorCode)_p_hip_err__, name, descr); \
 85:       } \
 86:     } while (0)
 87:   #define CHKERRHIP(...) PetscCallHIP(__VA_ARGS__)

 89:   #define PetscHIPCheckLaunch \
 90:     do { \
 91:       /* Check synchronous errors, i.e. pre-launch */ \
 92:       PetscCallHIP(hipGetLastError()); \
 93:       /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
 94:       PetscCallHIP(hipDeviceSynchronize()); \
 95:     } while (0)

 97:   #define PetscCallHIPBLAS(...) \
 98:     do { \
 99:       const hipblasStatus_t _p_hipblas_stat__ = __VA_ARGS__; \
100:       if (PetscUnlikely(_p_hipblas_stat__ != HIPBLAS_STATUS_SUCCESS)) { \
101:         const char *name = PetscHIPBLASGetErrorName(_p_hipblas_stat__); \
102:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "hipBLAS error %d (%s)", (PetscErrorCode)_p_hipblas_stat__, name); \
103:       } \
104:     } while (0)
105:   #define CHKERRHIPBLAS(...) PetscCallHIPBLAS(__VA_ARGS__)

107:   #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
108:     /* HIPSPARSE & HIPSOLVER have better functionality with ROCm-4.5 or newer */
109:     #define PetscCallHIPSPARSE(...) \
110:       do { \
111:         const hipsparseStatus_t _p_hipsparse_stat__ = __VA_ARGS__; \
112:         if (PetscUnlikely(_p_hipsparse_stat__ != HIPSPARSE_STATUS_SUCCESS)) { \
113:           const char *name = PetscHIPSPARSEGetErrorName(_p_hipsparse_stat__); \
114:           PetscCheck((_p_hipsparse_stat__ != HIPSPARSE_STATUS_NOT_INITIALIZED) && (_p_hipsparse_stat__ != HIPSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, "hipSPARSE errorcode %d (%s): Reports not initialized or alloc failed; this indicates the GPU has run out resources", (int)_p_hipsparse_stat__, name); \
115:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "hipSPARSE errorcode %d (%s)", (int)_p_hipsparse_stat__, name); \
116:         } \
117:       } while (0)
118:     #define CHKERRHIPSPARSE(...) PetscCallHIPSPARSE(__VA_ARGS__)

120:     #define PetscCallHIPSOLVER(...) \
121:       do { \
122:         const hipsolverStatus_t _p_hipsolver_stat__ = __VA_ARGS__; \
123:         if (PetscUnlikely(_p_hipsolver_stat__ != HIPSOLVER_STATUS_SUCCESS)) { \
124:           const char *name = PetscHIPSolverGetErrorName(_p_hipsolver_stat__); \
125:           if (((_p_hipsolver_stat__ == HIPSOLVER_STATUS_NOT_INITIALIZED) || (_p_hipsolver_stat__ == HIPSOLVER_STATUS_ALLOC_FAILED) || (_p_hipsolver_stat__ == HIPSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_HIP)) { \
126:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
127:                     "hipSolver error %d (%s). " \
128:                     "This indicates the GPU may have run out resources", \
129:                     (PetscErrorCode)_p_hipsolver_stat__, name); \
130:           } else { \
131:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "hipSolver error %d (%s)", (PetscErrorCode)_p_hipsolver_stat__, name); \
132:           } \
133:         } \
134:       } while (0)
135:     #define CHKERRHIPSOLVER(...) PetscCallHIPSOLVER(__VA_ARGS__)

137:   #else /* PETSC_PKG_HIP_VERSION_GE(4,5,0) */
138:     /* hipSolver does not exist yet so we work around it
139:   rocSOLVER users rocBLAS for the handle
140:   * */
141:     #if defined(__HIP_PLATFORM_NVCC__)
142:       #include <cusolverDn.h>
143: typedef cusolverDnHandle_t hipsolverHandle_t;
144: typedef cusolverStatus_t   hipsolverStatus_t;

146: /* Alias hipsolverDestroy to cusolverDnDestroy */
147: static inline hipsolverStatus_t hipsolverDestroy(hipsolverHandle_t *hipsolverhandle)
148: {
149:   return cusolverDnDestroy(hipsolverhandle);
150: }

152: /* Alias hipsolverCreate to cusolverDnCreate */
153: static inline hipsolverStatus_t hipsolverCreate(hipsolverHandle_t *hipsolverhandle)
154: {
155:   return cusolverDnCreate(hipsolverhandle);
156: }

158: /* Alias hipsolverGetStream to cusolverDnGetStream */
159: static inline hipsolverStatus_t hipsolverGetStream(hipsolverHandle_t handle, hipStream_t *stream)
160: {
161:   return cusolverDnGetStream(handle, stream);
162: }

164: /* Alias hipsolverSetStream to cusolverDnSetStream */
165: static inline hipsolverStatus_t hipsolverSetStream(hipsolverHandle_t handle, hipStream_t stream)
166: {
167:   return cusolveDnSetStream(handle, stream);
168: }
169:     #else /* __HIP_PLATFORM_HCC__ */
170:       #include <rocsolver.h>
171:       #include <rocblas.h>
172: typedef rocblas_handle hipsolverHandle_t;
173: typedef rocblas_status hipsolverStatus_t;

175: /* Alias hipsolverDestroy to rocblas_destroy_handle */
176: static inline hipsolverStatus_t hipsolverDestroy(hipsolverHandle_t hipsolverhandle)
177: {
178:   return rocblas_destroy_handle(hipsolverhandle);
179: }

181: /* Alias hipsolverCreate to rocblas_destroy_handle */
182: static inline hipsolverStatus_t hipsolverCreate(hipsolverHandle_t *hipsolverhandle)
183: {
184:   return rocblas_create_handle(hipsolverhandle);
185: }

187: // Alias hipsolverGetStream to rocblas_get_stream
188: static inline hipsolverStatus_t hipsolverGetStream(hipsolverHandle_t handle, hipStream_t *stream)
189: {
190:   return rocblas_get_stream(handle, stream);
191: }

193: // Alias hipsolverSetStream to rocblas_set_stream
194: static inline hipsolverStatus_t hipsolverSetStream(hipsolverHandle_t handle, hipStream_t stream)
195: {
196:   return rocblas_set_stream(handle, stream);
197: }
198:     #endif // __HIP_PLATFORM_NVCC__
199:   #endif   /* PETSC_PKG_HIP_VERSION_GE(4,5,0) */
200: // REMOVE ME
201: PETSC_EXTERN hipStream_t    PetscDefaultHipStream; // The default stream used by PETSc
202: PETSC_EXTERN PetscErrorCode PetscHIPBLASGetHandle(hipblasHandle_t *);
203: PETSC_EXTERN PetscErrorCode PetscHIPSOLVERGetHandle(hipsolverHandle_t *);
204: PETSC_EXTERN PetscErrorCode PetscGetCurrentHIPStream(hipStream_t *);

206: #endif // PETSC_HAVE_HIP

208: // these can also be defined in petscdevice_cuda.h so we undef and define them *only* if the
209: // current compiler is HCC. In this case if petscdevice_cuda.h is included first, the macros
210: // would already be defined, but they would be empty since we cannot be using NVCC at the same
211: // time.
212: #if PetscDefined(USING_HCC)
213:   #undef PETSC_HOST_DECL
214:   #undef PETSC_DEVICE_DECL
215:   #undef PETSC_KERNEL_DECL
216:   #undef PETSC_SHAREDMEM_DECL
217:   #undef PETSC_FORCEINLINE
218:   #undef PETSC_CONSTMEM_DECL

220:   #define PETSC_HOST_DECL      __host__
221:   #define PETSC_DEVICE_DECL    __device__
222:   #define PETSC_KERNEL_DECL    __global__
223:   #define PETSC_SHAREDMEM_DECL __shared__
224:   #define PETSC_FORCEINLINE    __forceinline__
225:   #define PETSC_CONSTMEM_DECL  __constant__
226: #endif

228: #if !defined(PETSC_HOST_DECL) // use HOST_DECL as canary
229:   #define PETSC_HOST_DECL
230:   #define PETSC_DEVICE_DECL
231:   #define PETSC_KERNEL_DECL
232:   #define PETSC_SHAREDMEM_DECL
233:   #define PETSC_FORCEINLINE inline
234:   #define PETSC_CONSTMEM_DECL
235: #endif

237: #if !defined(PETSC_DEVICE_DEFINED_DECLS_PRIVATE)
238:   #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE
239:   #define PETSC_HOSTDEVICE_DECL        PETSC_HOST_DECL PETSC_DEVICE_DECL
240:   #define PETSC_DEVICE_INLINE_DECL     PETSC_DEVICE_DECL PETSC_FORCEINLINE
241:   #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE
242: #endif