Actual source code: petscdevice_cuda.h

  1: #pragma once

  3: #include <petscdevice.h>
  4: #include <petscpkg_version.h>

  6: /* MANSEC = Sys */

  8: #if defined(__NVCC__) || defined(__CUDACC__)
  9:   #define PETSC_USING_NVCC 1
 10: #endif

 12: #if PetscDefined(HAVE_CUDA)
 13:   #include <cuda.h>
 14:   #include <cuda_runtime.h>
 15:   #include <cublas_v2.h>
 16:   #define DISABLE_CUSPARSE_DEPRECATED
 17:   #include <cusparse.h>
 18:   #include <cusolverDn.h>
 19:   #include <cusolverSp.h>
 20:   #include <cufft.h>
 21:   #include <curand.h>
 22:   #include <nvml.h> // NVML comes with the NVIDIA GPU driver

 24: /* cuBLAS does not have cublasGetErrorName(). We create one on our own. */
 25: PETSC_EXTERN const char *PetscCUBLASGetErrorName(cublasStatus_t); /* PETSC_EXTERN since it is exposed by the CHKERRCUBLAS macro */
 26: PETSC_EXTERN const char *PetscCUSolverGetErrorName(cusolverStatus_t);
 27: PETSC_EXTERN const char *PetscCUFFTGetErrorName(cufftResult);

 29:   /*MC
 30:     WaitForCUDA - Block the calling host thread until all previously queued work on the current CUDA device has completed

 32:     Synopsis:
 33: #include <petscdevice_cuda.h>
 34:     cudaError_t WaitForCUDA(void)

 36:     Not Collective; No Fortran Support

 38:     Level: developer

 40:     Note:
 41:     Thin convenience wrapper around `cudaDeviceSynchronize()`. Marked for removal in favour of
 42:     explicit `PetscDeviceContext` synchronization.

 44: .seealso: `PetscDeviceContext`, `PetscDeviceContextSynchronize()`, `WaitForHIP()`
 45: M*/
 46:   /* REMOVE ME */
 47:   #define WaitForCUDA() cudaDeviceSynchronize()

 49:   /* CUDART_VERSION = 1000 x major + 10 x minor version */

 51:   /* Could not find exactly which CUDART_VERSION introduced cudaGetErrorName. At least it was in CUDA 8.0 (Sep. 2016) */
 52:   #if PETSC_PKG_CUDA_VERSION_GE(8, 0, 0)
 53:     #define PetscCallCUDAVoid(...) \
 54:       do { \
 55:         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
 56:         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
 57:       } while (0)

 59:     #define PetscCallCUDA(...) \
 60:       do { \
 61:         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
 62:         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d (%s) : %s", (PetscErrorCode)_p_cuda_err__, cudaGetErrorName(_p_cuda_err__), cudaGetErrorString(_p_cuda_err__)); \
 63:       } while (0)
 64:   #else /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
 65:     #define PetscCallCUDA(...) \
 66:       do { \
 67:         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
 68:         PetscCheck(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
 69:       } while (0)

 71:     #define PetscCallCUDAVoid(...) \
 72:       do { \
 73:         const cudaError_t _p_cuda_err__ = __VA_ARGS__; \
 74:         PetscCheckAbort(_p_cuda_err__ == cudaSuccess, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuda error %d", (PetscErrorCode)_p_cuda_err__); \
 75:       } while (0)
 76:   #endif /* PETSC_PKG_CUDA_VERSION_GE(8,0,0) */
 77:   #define CHKERRCUDA(...) PetscCallCUDA(__VA_ARGS__)

 79:   #define PetscCUDACheckLaunch \
 80:     do { \
 81:       /* Check synchronous errors, i.e. pre-launch */ \
 82:       PetscCallCUDA(cudaGetLastError()); \
 83:       /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
 84:       PetscCallCUDA(cudaDeviceSynchronize()); \
 85:     } while (0)

 87:   #define PetscCallCUBLAS(...) \
 88:     do { \
 89:       const cublasStatus_t _p_cublas_stat__ = __VA_ARGS__; \
 90:       if (PetscUnlikely(_p_cublas_stat__ != CUBLAS_STATUS_SUCCESS)) { \
 91:         const char *name = PetscCUBLASGetErrorName(_p_cublas_stat__); \
 92:         if (((_p_cublas_stat__ == CUBLAS_STATUS_NOT_INITIALIZED) || (_p_cublas_stat__ == CUBLAS_STATUS_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
 93:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
 94:                   "cuBLAS error %d (%s). " \
 95:                   "Reports not initialized or alloc failed; " \
 96:                   "this indicates the GPU may have run out resources", \
 97:                   (PetscErrorCode)_p_cublas_stat__, name); \
 98:         } else { \
 99:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuBLAS error %d (%s)", (PetscErrorCode)_p_cublas_stat__, name); \
100:         } \
101:       } \
102:     } while (0)
103:   #define CHKERRCUBLAS(...) PetscCallCUBLAS(__VA_ARGS__)

105:   #if (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) /* According to cuda/10.1.168 on OLCF Summit */
106:     #define PetscCallCUSPARSE(...) \
107:       do { \
108:         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
109:         if (PetscUnlikely(_p_cusparse_stat__)) { \
110:           const char *name  = cusparseGetErrorName(_p_cusparse_stat__); \
111:           const char *descr = cusparseGetErrorString(_p_cusparse_stat__); \
112:           PetscCheck((_p_cusparse_stat__ != CUSPARSE_STATUS_NOT_INITIALIZED) && (_p_cusparse_stat__ != CUSPARSE_STATUS_ALLOC_FAILED), PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
113:                      "cuSPARSE errorcode %d (%s) : %s.; " \
114:                      "this indicates the GPU has run out resources", \
115:                      (int)_p_cusparse_stat__, name, descr); \
116:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d (%s) : %s", (int)_p_cusparse_stat__, name, descr); \
117:         } \
118:       } while (0)
119:   #else /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
120:     #define PetscCallCUSPARSE(...) \
121:       do { \
122:         const cusparseStatus_t _p_cusparse_stat__ = __VA_ARGS__; \
123:         PetscCheck(_p_cusparse_stat__ == CUSPARSE_STATUS_SUCCESS, PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSPARSE errorcode %d", (PetscErrorCode)_p_cusparse_stat__); \
124:       } while (0)
125:   #endif /* (CUSPARSE_VER_MAJOR > 10 || CUSPARSE_VER_MAJOR == 10 && CUSPARSE_VER_MINOR >= 2) */
126:   #define CHKERRCUSPARSE(...) PetscCallCUSPARSE(__VA_ARGS__)

128:   #define PetscCallCUSOLVER(...) \
129:     do { \
130:       const cusolverStatus_t _p_cusolver_stat__ = __VA_ARGS__; \
131:       if (PetscUnlikely(_p_cusolver_stat__ != CUSOLVER_STATUS_SUCCESS)) { \
132:         const char *name = PetscCUSolverGetErrorName(_p_cusolver_stat__); \
133:         if (((_p_cusolver_stat__ == CUSOLVER_STATUS_NOT_INITIALIZED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_ALLOC_FAILED) || (_p_cusolver_stat__ == CUSOLVER_STATUS_INTERNAL_ERROR)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
134:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
135:                   "cuSolver error %d (%s). " \
136:                   "This indicates the GPU may have run out resources", \
137:                   (PetscErrorCode)_p_cusolver_stat__, name); \
138:         } else { \
139:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuSolver error %d (%s)", (PetscErrorCode)_p_cusolver_stat__, name); \
140:         } \
141:       } \
142:     } while (0)
143:   #define CHKERRCUSOLVER(...) PetscCallCUSOLVER(__VA_ARGS__)

145:   #define PetscCallCUFFT(...) \
146:     do { \
147:       const cufftResult_t _p_cufft_stat__ = __VA_ARGS__; \
148:       if (PetscUnlikely(_p_cufft_stat__ != CUFFT_SUCCESS)) { \
149:         const char *name = PetscCUFFTGetErrorName(_p_cufft_stat__); \
150:         if (((_p_cufft_stat__ == CUFFT_SETUP_FAILED) || (_p_cufft_stat__ == CUFFT_ALLOC_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
151:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
152:                   "cuFFT error %d (%s). " \
153:                   "Reports not initialized or alloc failed; " \
154:                   "this indicates the GPU has run out resources", \
155:                   (PetscErrorCode)_p_cufft_stat__, name); \
156:         } else { \
157:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuFFT error %d (%s)", (PetscErrorCode)_p_cufft_stat__, name); \
158:         } \
159:       } \
160:     } while (0)
161:   #define CHKERRCUFFT(...) PetscCallCUFFT(__VA_ARGS__)

163:   #define PetscCallCURAND(...) \
164:     do { \
165:       const curandStatus_t _p_curand_stat__ = __VA_ARGS__; \
166:       if (PetscUnlikely(_p_curand_stat__ != CURAND_STATUS_SUCCESS)) { \
167:         if (((_p_curand_stat__ == CURAND_STATUS_INITIALIZATION_FAILED) || (_p_curand_stat__ == CURAND_STATUS_ALLOCATION_FAILED)) && PetscDeviceInitialized(PETSC_DEVICE_CUDA)) { \
168:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU_RESOURCE, \
169:                   "cuRAND error %d. " \
170:                   "Reports not initialized or alloc failed; " \
171:                   "this indicates the GPU has run out resources", \
172:                   (PetscErrorCode)_p_curand_stat__); \
173:         } else { \
174:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_GPU, "cuRand error %d", (PetscErrorCode)_p_curand_stat__); \
175:         } \
176:       } \
177:     } while (0)
178:   #define CHKERRCURAND(...) PetscCallCURAND(__VA_ARGS__)

180: PETSC_EXTERN cudaStream_t   PetscDefaultCudaStream; // The default stream used by PETSc
181: PETSC_EXTERN PetscErrorCode PetscCUBLASGetHandle(cublasHandle_t *);
182: PETSC_EXTERN PetscErrorCode PetscCUSOLVERDnGetHandle(cusolverDnHandle_t *);
183: PETSC_EXTERN PetscErrorCode PetscGetCurrentCUDAStream(cudaStream_t *);

185: #endif // PETSC_HAVE_CUDA

187: // these can also be defined in petscdevice_hip.h so we undef and define them *only* if the
188: // current compiler is NVCC. In this case if petscdevice_hip.h is included first, the macros
189: // would already be defined, but they would be empty since we cannot be using HCC at the same
190: // time.
191: #if PetscDefined(USING_NVCC)
192:   #undef PETSC_HOST_DECL
193:   #undef PETSC_DEVICE_DECL
194:   #undef PETSC_KERNEL_DECL
195:   #undef PETSC_SHAREDMEM_DECL
196:   #undef PETSC_FORCEINLINE
197:   #undef PETSC_CONSTMEM_DECL

199:   #define PETSC_HOST_DECL      __host__
200:   #define PETSC_DEVICE_DECL    __device__
201:   #define PETSC_KERNEL_DECL    __global__
202:   #define PETSC_SHAREDMEM_DECL __shared__
203:   #define PETSC_FORCEINLINE    __forceinline__
204:   #define PETSC_CONSTMEM_DECL  __constant__
205: #endif

207: #if !defined(PETSC_HOST_DECL) // use HOST_DECL as canary
208:   #define PETSC_HOST_DECL
209:   #define PETSC_DEVICE_DECL
210:   #define PETSC_KERNEL_DECL
211:   #define PETSC_SHAREDMEM_DECL
212:   #define PETSC_FORCEINLINE inline
213:   #define PETSC_CONSTMEM_DECL
214: #endif

216: #if !defined(PETSC_DEVICE_DEFINED_DECLS_PRIVATE)
217:   #define PETSC_DEVICE_DEFINED_DECLS_PRIVATE
218:   #define PETSC_HOSTDEVICE_DECL        PETSC_HOST_DECL PETSC_DEVICE_DECL
219:   #define PETSC_DEVICE_INLINE_DECL     PETSC_DEVICE_DECL PETSC_FORCEINLINE
220:   #define PETSC_HOSTDEVICE_INLINE_DECL PETSC_HOSTDEVICE_DECL PETSC_FORCEINLINE
221: #endif