Actual source code: petscdevice_cuda.h
1: #pragma once
3: #include <petscdevice.h>
4: #include <petscpkg_version.h>
6: /* MANSEC = Sys */
8: #if defined(__NVCC__) || defined(__CUDACC__)
9: #define PETSC_USING_NVCC 1
10: #endif
12: #if PetscDefined(HAVE_CUDA)
13: #include <cuda.h>
14: #include <cuda_runtime.h>
15: #include <cublas_v2.h>
16: #define DISABLE_CUSPARSE_DEPRECATED
17: #include <cusparse.h>
18: #include <cusolverDn.h>
19: #include <cusolverSp.h>
20: #include <cufft.h>
21: #include <curand.h>
22: #include <nvml.h> // NVML comes with the NVIDIA GPU driver
24: /* cuBLAS does not have cublasGetErrorName(). We create one on our own. */
25: PETSC_EXTERN const char *PetscCUBLASGetErrorName(cublasStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRCUBLAS macro */
26: PETSC_EXTERN const char *PetscCUSolverGetErrorName(cusolverStatus_t);
27: PETSC_EXTERN const char *PetscCUFFTGetErrorName(cufftResult);
29: /*MC
30: WaitForCUDA - Block the calling host thread until all previously queued work on the current CUDA device has completed
32: Synopsis:
33: #include <petscdevice_cuda.h>
34: cudaError_t WaitForCUDA(void)
36: Not Collective; No Fortran Support
38: Level: developer
40: Note:
41: Thin convenience wrapper around `cudaDeviceSynchronize()`. Marked for removal in favour of
42: explicit `PetscDeviceContext` synchronization.
44: .seealso: `PetscDeviceContext`, `PetscDeviceContextSynchronize()`, `WaitForHIP()`
45: M*/
46: /* REMOVE ME */
47: #define WaitForCUDA() cudaDeviceSynchronize()
49: /* CUDART_VERSION = 1000 x major + 10 x minor version */
51: /* Could not find exactly which CUDART_VERSION introduced cudaGetErrorName. At least it was in CUDA 8.0 (Sep. 2016) */
52: #if PETSC_PKG_CUDA_VERSION_GE(8, 0, 0)
53: #define PetscCallCUDAVoid(...) \
54: do { \
55: const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
56: PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
57: } while (0)
59: #define PetscCallCUDA(...) \
60: do { \
61: const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
62: PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
63: } while (0)
64: #else /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
65: #define PetscCallCUDA(...) \
66: do { \
67: const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
68: PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
69: } while (0)
71: #define PetscCallCUDAVoid(...) \
72: do { \
73: const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
74: PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
75: } while (0)
76: #endif /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
77: #define CHKERRCUDA(...) PetscCallCUDA(__VA_ARGS__)
79: #define PetscCUDACheckLaunch \
80: do { \
81: /* Check synchronous errors, i.e. pre-launch */ \
82: PetscCallCUDA(cudaGetLastError()); \
83: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
84: PetscCallCUDA(cudaDeviceSynchronize()); \
85: } while (0)
87: #define PetscCallCUBLAS(...) \
88: do { \
89: const cublasStatus_t _p_cublas_stat__ = __VA_ARGS__; \
90: if (PetscUnlikely(_p_cublas_stat__ != CUBLAS_STATUS_SUCCESS)) { \
91: const char *name = PetscCUBLASGetErrorName(_p_cublas_stat__); \
92: if (((_p_cublas_stat__ == CUBLAS_STATUS_NOT_INITIALIZED) || (_p_cublas_stat__ == CUBLAS_STATUS_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
93: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
94: "cuBLAS error %d (%s). " \
95: "Reports not initialized or alloc failed; " \
96: "this indicates the GPU may have run out resources", \
97: (PetscErrorCode)_p_cublas_stat__, name); \
98: } else { \
99: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuBLAS error %d (%s)", (PetscErrorCode)_p_cublas_stat__, name); \
100: } \
101: } \
102: } while (0)
103: #define CHKERRCUBLAS(...) PetscCallCUBLAS(__VA_ARGS__)
105: #if (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) /* According to cuda/10.1.168 on OLCF Summit */
106: #define PetscCallCUSPARSE(...) \
107: do { \
108: const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
109: if (PetscUnlikely(_p_cusparse_stat__)) { \
110: const char *name = cusparseGetErrorName(_p_cusparse_stat__); \
111: const char *descr = cusparseGetErrorString(_p_cusparse_stat__); \
112: PetscCheck((_p_cusparse_stat__ != CUSPARSE_STATUS_NOT_INITIALIZED) && (_p_cusparse_stat__ != CUSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
113: "cuSPARSE errorcode %d (%s) : %s.; " \
114: "this indicates the GPU has run out resources", \
115: (int)_p_cusparse_stat__, name, descr); \
116: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d (%s) : %s", (int)_p_cusparse_stat__, name, descr); \
117: } \
118: } while (0)
119: #else /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
120: #define PetscCallCUSPARSE(...) \
121: do { \
122: const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
123: PetscCheck(_p_cusparse_stat__ == CUSPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d", (PetscErrorCode)_p_cusparse_stat__); \
124: } while (0)
125: #endif /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
126: #define CHKERRCUSPARSE(...) PetscCallCUSPARSE(__VA_ARGS__)
128: #define PetscCallCUSOLVER(...) \
129: do { \
130: const cusolverStatus_t _p_cusolver_stat__ = __VA_ARGS__; \
131: if (PetscUnlikely(_p_cusolver_stat__ != CUSOLVER_STATUS_SUCCESS)) { \
132: const char *name = PetscCUSolverGetErrorName(_p_cusolver_stat__); \
133: if (((_p_cusolver_stat__ == CUSOLVER_STATUS_NOT_INITIALIZED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_ALLOC_FAILED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
134: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
135: "cuSolver error %d (%s). " \
136: "This indicates the GPU may have run out resources", \
137: (PetscErrorCode)_p_cusolver_stat__, name); \
138: } else { \
139: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSolver error %d (%s)", (PetscErrorCode)_p_cusolver_stat__, name); \
140: } \
141: } \
142: } while (0)
143: #define CHKERRCUSOLVER(...) PetscCallCUSOLVER(__VA_ARGS__)
145: #define PetscCallCUFFT(...) \
146: do { \
147: const cufftResult_t _p_cufft_stat__ = __VA_ARGS__; \
148: if (PetscUnlikely(_p_cufft_stat__ != CUFFT_SUCCESS)) { \
149: const char *name = PetscCUFFTGetErrorName(_p_cufft_stat__); \
150: if (((_p_cufft_stat__ == CUFFT_SETUP_FAILED) || (_p_cufft_stat__ == CUFFT_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
151: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
152: "cuFFT error %d (%s). " \
153: "Reports not initialized or alloc failed; " \
154: "this indicates the GPU has run out resources", \
155: (PetscErrorCode)_p_cufft_stat__, name); \
156: } else { \
157: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuFFT error %d (%s)", (PetscErrorCode)_p_cufft_stat__, name); \
158: } \
159: } \
160: } while (0)
161: #define CHKERRCUFFT(...) PetscCallCUFFT(__VA_ARGS__)
163: #define PetscCallCURAND(...) \
164: do { \
165: const curandStatus_t _p_curand_stat__ = __VA_ARGS__; \
166: if (PetscUnlikely(_p_curand_stat__ != CURAND_STATUS_SUCCESS)) { \
167: if (((_p_curand_stat__ == CURAND_STATUS_INITIALIZATION_FAILED) || (_p_curand_stat__ == CURAND_STATUS_ALLOCATION_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
168: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
169: "cuRAND error %d. " \
170: "Reports not initialized or alloc failed; " \
171: "this indicates the GPU has run out resources", \
172: (PetscErrorCode)_p_curand_stat__); \
173: } else { \
174: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuRand error %d", (PetscErrorCode)_p_curand_stat__); \
175: } \
176: } \
177: } while (0)
178: #define CHKERRCURAND(...) PetscCallCURAND(__VA_ARGS__)
180: PETSC_EXTERN cudaStream_t PetscDefaultCudaStream; // The default stream used by PETSc
181: PETSC_EXTERN PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *);
182: PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *);
183: PETSC_EXTERN PetscErrorCode PetscGetCurrentCUDAStream(cudaStream_t *);
185: #endif // PETSC_HAVE_CUDA
187: // these can also be defined in petscdevice_hip.h so we undef and define them *only* if the
188: // current compiler is NVCC. In this case if petscdevice_hip.h is included first, the macros
189: // would already be defined, but they would be empty since we cannot be using HCC at the same
190: // time.
191: #if PetscDefined(USING_NVCC)
192: #undef PETSC_HOST_DECL
193: #undef PETSC_DEVICE_DECL
194: #undef PETSC_KERNEL_DECL
195: #undef PETSC_SHAREDMEM_DECL
196: #undef PETSC_FORCEINLINE
197: #undef PETSC_CONSTMEM_DECL
199: #define PETSC_HOST_DECL __host__
200: #define PETSC_DEVICE_DECL __device__
201: #define PETSC_KERNEL_DECL __global__
202: #define PETSC_SHAREDMEM_DECL __shared__
203: #define PETSC_FORCEINLINE __forceinline__
204: #define PETSC_CONSTMEM_DECL __constant__
205: #endif
207: #if !defined(PETSC_HOST_DECL) // use HOST_DECL as canary
208: #define PETSC_HOST_DECL
209: #define PETSC_DEVICE_DECL
210: #define PETSC_KERNEL_DECL
211: #define PETSC_SHAREDMEM_DECL
212: #define PETSC_FORCEINLINE inline
213: #define PETSC_CONSTMEM_DECL
214: #endif
216: #if !defined(PETSC_DEVICE_DEFINED_DECLS_PRIVATE)
217: #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE
218: #define PETSC_HOSTDEVICE_DECL PETSC_HOST_DECL PETSC_DEVICE_DECL
219: #define PETSC_DEVICE_INLINE_DECL PETSC_DEVICE_DECL PETSC_FORCEINLINE
220: #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE
221: #endif