Actual source code: cusparsematimpl.h

  1: #pragma once

  3: #include <petscpkg_version.h>
  4: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
  5: #include <../src/sys/objects/device/impls/cupm/cupmthrustutility.hpp>
  6: #include <petsc/private/petsclegacycupmblas.h>

  8: #include <algorithm>
  9: #include <vector>

 11: #include <thrust/device_vector.h>
 12: #include <thrust/device_ptr.h>
 13: #include <thrust/device_malloc_allocator.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/sequence.h>
 17: #include <thrust/system/system_error.h>

 19: #if defined(PETSC_USE_COMPLEX)
 20:   #if defined(PETSC_USE_REAL_SINGLE)
 21: const cuComplex PETSC_CUSPARSE_ONE  = {1.0f, 0.0f};
 22: const cuComplex PETSC_CUSPARSE_ZERO = {0.0f, 0.0f};
 23:     #define cusparseXcsrilu02_bufferSize(a, b, c, d, e, f, g, h, i)  cusparseCcsrilu02_bufferSize(a, b, c, d, (cuComplex *)e, f, g, h, i)
 24:     #define cusparseXcsrilu02_analysis(a, b, c, d, e, f, g, h, i, j) cusparseCcsrilu02_analysis(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
 25:     #define cusparseXcsrilu02(a, b, c, d, e, f, g, h, i, j)          cusparseCcsrilu02(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
 26:     #define cusparseXcsric02_bufferSize(a, b, c, d, e, f, g, h, i)   cusparseCcsric02_bufferSize(a, b, c, d, (cuComplex *)e, f, g, h, i)
 27:     #define cusparseXcsric02_analysis(a, b, c, d, e, f, g, h, i, j)  cusparseCcsric02_analysis(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
 28:     #define cusparseXcsric02(a, b, c, d, e, f, g, h, i, j)           cusparseCcsric02(a, b, c, d, (cuComplex *)e, f, g, h, i, j)
 29:   #elif defined(PETSC_USE_REAL_DOUBLE)
 30: const cuDoubleComplex PETSC_CUSPARSE_ONE  = {1.0, 0.0};
 31: const cuDoubleComplex PETSC_CUSPARSE_ZERO = {0.0, 0.0};
 32:     #define cusparseXcsrilu02_bufferSize(a, b, c, d, e, f, g, h, i)  cusparseZcsrilu02_bufferSize(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i)
 33:     #define cusparseXcsrilu02_analysis(a, b, c, d, e, f, g, h, i, j) cusparseZcsrilu02_analysis(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
 34:     #define cusparseXcsrilu02(a, b, c, d, e, f, g, h, i, j)          cusparseZcsrilu02(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
 35:     #define cusparseXcsric02_bufferSize(a, b, c, d, e, f, g, h, i)   cusparseZcsric02_bufferSize(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i)
 36:     #define cusparseXcsric02_analysis(a, b, c, d, e, f, g, h, i, j)  cusparseZcsric02_analysis(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
 37:     #define cusparseXcsric02(a, b, c, d, e, f, g, h, i, j)           cusparseZcsric02(a, b, c, d, (cuDoubleComplex *)e, f, g, h, i, j)
 38:   #endif
 39: #else
 40: const PetscScalar PETSC_CUSPARSE_ONE  = 1.0;
 41: const PetscScalar PETSC_CUSPARSE_ZERO = 0.0;
 42:   #if defined(PETSC_USE_REAL_SINGLE)
 43:     #define cusparseXcsrilu02_bufferSize cusparseScsrilu02_bufferSize
 44:     #define cusparseXcsrilu02_analysis   cusparseScsrilu02_analysis
 45:     #define cusparseXcsrilu02            cusparseScsrilu02
 46:     #define cusparseXcsric02_bufferSize  cusparseScsric02_bufferSize
 47:     #define cusparseXcsric02_analysis    cusparseScsric02_analysis
 48:     #define cusparseXcsric02             cusparseScsric02
 49:   #elif defined(PETSC_USE_REAL_DOUBLE)
 50:     #define cusparseXcsrilu02_bufferSize cusparseDcsrilu02_bufferSize
 51:     #define cusparseXcsrilu02_analysis   cusparseDcsrilu02_analysis
 52:     #define cusparseXcsrilu02            cusparseDcsrilu02
 53:     #define cusparseXcsric02_bufferSize  cusparseDcsric02_bufferSize
 54:     #define cusparseXcsric02_analysis    cusparseDcsric02_analysis
 55:     #define cusparseXcsric02             cusparseDcsric02
 56:   #endif
 57: #endif

 59: #define csrsvInfo_t              csrsv2Info_t
 60: #define cusparseCreateCsrsvInfo  cusparseCreateCsrsv2Info
 61: #define cusparseDestroyCsrsvInfo cusparseDestroyCsrsv2Info
 62: #if defined(PETSC_USE_COMPLEX)
 63:   #if defined(PETSC_USE_REAL_SINGLE)
 64:     #define cusparseXcsrsv_buffsize(a, b, c, d, e, f, g, h, i, j)          cusparseCcsrsv2_bufferSize(a, b, c, d, e, (cuComplex *)(f), g, h, i, j)
 65:     #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i, j, k)       cusparseCcsrsv2_analysis(a, b, c, d, e, (const cuComplex *)(f), g, h, i, j, k)
 66:     #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusparseCcsrsv2_solve(a, b, c, d, (const cuComplex *)(e), f, (const cuComplex *)(g), h, i, j, (const cuComplex *)(k), (cuComplex *)(l), m, n)
 67:   #elif defined(PETSC_USE_REAL_DOUBLE)
 68:     #define cusparseXcsrsv_buffsize(a, b, c, d, e, f, g, h, i, j)          cusparseZcsrsv2_bufferSize(a, b, c, d, e, (cuDoubleComplex *)(f), g, h, i, j)
 69:     #define cusparseXcsrsv_analysis(a, b, c, d, e, f, g, h, i, j, k)       cusparseZcsrsv2_analysis(a, b, c, d, e, (const cuDoubleComplex *)(f), g, h, i, j, k)
 70:     #define cusparseXcsrsv_solve(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusparseZcsrsv2_solve(a, b, c, d, (const cuDoubleComplex *)(e), f, (const cuDoubleComplex *)(g), h, i, j, (const cuDoubleComplex *)(k), (cuDoubleComplex *)(l), m, n)
 71:   #endif
 72: #else /* not complex */
 73:   #if defined(PETSC_USE_REAL_SINGLE)
 74:     #define cusparseXcsrsv_buffsize cusparseScsrsv2_bufferSize
 75:     #define cusparseXcsrsv_analysis cusparseScsrsv2_analysis
 76:     #define cusparseXcsrsv_solve    cusparseScsrsv2_solve
 77:   #elif defined(PETSC_USE_REAL_DOUBLE)
 78:     #define cusparseXcsrsv_buffsize cusparseDcsrsv2_bufferSize
 79:     #define cusparseXcsrsv_analysis cusparseDcsrsv2_analysis
 80:     #define cusparseXcsrsv_solve    cusparseDcsrsv2_solve
 81:   #endif
 82: #endif

 84: #define cusparse_csr2csc cusparseCsr2cscEx2
 85: #if defined(PETSC_USE_COMPLEX)
 86:   #if defined(PETSC_USE_REAL_SINGLE)
 87:     #define cusparse_scalartype                                                             CUDA_C_32F
 88:     #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) cusparseCcsrgeam2(a, b, c, (cuComplex *)d, e, f, (cuComplex *)g, h, i, (cuComplex *)j, k, l, (cuComplex *)m, n, o, p, (cuComplex *)q, r, s, t)
 89:     #define cusparse_csr_spgeam_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
 90:       cusparseCcsrgeam2_bufferSizeExt(a, b, c, (cuComplex *)d, e, f, (cuComplex *)g, h, i, (cuComplex *)j, k, l, (cuComplex *)m, n, o, p, (cuComplex *)q, r, s, t)
 91:   #elif defined(PETSC_USE_REAL_DOUBLE)
 92:     #define cusparse_scalartype CUDA_C_64F
 93:     #define cusparse_csr_spgeam(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
 94:       cusparseZcsrgeam2(a, b, c, (cuDoubleComplex *)d, e, f, (cuDoubleComplex *)g, h, i, (cuDoubleComplex *)j, k, l, (cuDoubleComplex *)m, n, o, p, (cuDoubleComplex *)q, r, s, t)
 95:     #define cusparse_csr_spgeam_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) \
 96:       cusparseZcsrgeam2_bufferSizeExt(a, b, c, (cuDoubleComplex *)d, e, f, (cuDoubleComplex *)g, h, i, (cuDoubleComplex *)j, k, l, (cuDoubleComplex *)m, n, o, p, (cuDoubleComplex *)q, r, s, t)
 97:   #endif
 98: #else /* not complex */
 99:   #if defined(PETSC_USE_REAL_SINGLE)
100:     #define cusparse_scalartype            CUDA_R_32F
101:     #define cusparse_csr_spgeam            cusparseScsrgeam2
102:     #define cusparse_csr_spgeam_bufferSize cusparseScsrgeam2_bufferSizeExt
103:   #elif defined(PETSC_USE_REAL_DOUBLE)
104:     #define cusparse_scalartype            CUDA_R_64F
105:     #define cusparse_csr_spgeam            cusparseDcsrgeam2
106:     #define cusparse_csr_spgeam_bufferSize cusparseDcsrgeam2_bufferSizeExt
107:   #endif
108: #endif

110: #define THRUSTINTARRAY32 thrust::device_vector<int>
111: #define THRUSTINTARRAY   thrust::device_vector<PetscInt>
112: #define THRUSTARRAY      thrust::device_vector<PetscScalar>

114: /* A CSR matrix nonzero structure */
115: struct CsrMatrix {
116:   PetscInt          num_rows;
117:   PetscInt          num_cols;
118:   PetscInt          num_entries;
119:   THRUSTINTARRAY32 *row_offsets;
120:   THRUSTINTARRAY32 *column_indices;
121:   THRUSTARRAY      *values;
122: };

124: /* This is struct holding the relevant data needed to a MatSolve */
125: struct Mat_SeqAIJCUSPARSETriFactorStruct {
126:   /* Data needed for triangular solve */
127:   cusparseMatDescr_t  descr;
128:   cusparseOperation_t solveOp;
129:   CsrMatrix          *csrMat;
130:   int                 solveBufferSize;
131:   void               *solveBuffer;
132:   size_t              csr2cscBufferSize; /* to transpose the triangular factor (only used for CUDA >= 11.0) */
133:   void               *csr2cscBuffer;
134:   PetscScalar        *AA_h; /* managed host buffer for moving values to the GPU */
135: };

137: /* This is a larger struct holding all the triangular factors for a solve, transpose solve, and any indices used in a reordering */
138: struct Mat_SeqAIJCUSPARSETriFactors {
139:   THRUSTINTARRAY  *rpermIndices; /* indices used for any reordering */
140:   THRUSTINTARRAY  *cpermIndices; /* indices used for any reordering */
141:   THRUSTARRAY     *workVector;
142:   cusparseHandle_t handle; /* a handle to the cusparse library */
143:   PetscInt         nnz;    /* number of nonzeros ... need this for accurate logging between ICC and ILU */
144:   cudaDeviceProp   dev_prop;
145:   PetscBool        init_dev_prop;

147:   PetscScalar *csrVal, *diag;             // the diagonal D in UtDU of Cholesky
148:   int         *csrRowPtr32, *csrColIdx32; // i,j of M. cusparseScsrilu02/ic02() etc require 32-bit indices
149:   PetscInt    *csrRowPtr, *csrColIdx;     // i, j of M on device for CUDA APIs that support 64-bit indices
150:   PetscScalar *csrVal_h, *diag_h;         // Since LU is done on host, we prepare a factored matrix in regular csr format on host and then copy it to device
151:   PetscInt    *csrRowPtr_h;               // csrColIdx_h is temporary, so it is not here

153:   /* Mixed mat descriptor types? yes, different cusparse APIs use different types */
154:   cusparseMatDescr_t   matDescr_M;
155:   cusparseSpMatDescr_t spMatDescr_L, spMatDescr_U;
156:   cusparseSpSVDescr_t  spsvDescr_L, spsvDescr_Lt, spsvDescr_U, spsvDescr_Ut;
157:   cusparseDnVecDescr_t dnVecDescr_X, dnVecDescr_Y;
158:   PetscScalar         *X, *Y; /* data array of dnVec X and Y */

160:   /* Mixed size types? yes, CUDA-11.7.0 declared cusparseDcsrilu02_bufferSizeExt() that returns size_t but did not implement it! */
161:   int    factBufferSize_M; /* M ~= LU or LLt */
162:   size_t spsvBufferSize_L, spsvBufferSize_Lt, spsvBufferSize_U, spsvBufferSize_Ut;
163:   /* cusparse needs various buffers for factorization and solve of L, U, Lt, or Ut.
164:      So save memory, we share the factorization buffer with one of spsvBuffer_L/U.
165:   */
166:   void *factBuffer_M, *spsvBuffer_L, *spsvBuffer_U, *spsvBuffer_Lt, *spsvBuffer_Ut;

168:   csrilu02Info_t        ilu0Info_M;
169:   csric02Info_t         ic0Info_M;
170:   int                   structural_zero, numerical_zero;
171:   cusparseSolvePolicy_t policy_M;

173:   /* In MatSolveTranspose() for ILU0, we use the two flags to do on-demand solve */
174:   PetscBool createdTransposeSpSVDescr;    /* Have we created SpSV descriptors for Lt, Ut? */
175:   PetscBool updatedTransposeSpSVAnalysis; /* Have we ever updated (done) SpSV analysis for Lt, Ut */
176:   PetscBool updatedSpSVAnalysis;          /* Have we ever updated (done) SpSV Analysis for L, U? */

178:   PetscLogDouble numericFactFlops; /* Estimated FLOPs in ILU0/ICC0 numeric factorization */
179: };

181: struct Mat_CusparseSpMV {
182:   PetscBool            initialized;    /* Don't rely on spmvBuffer != NULL to test if the struct is initialized, */
183:   size_t               spmvBufferSize; /* since I'm not sure if smvBuffer can be NULL even after cusparseSpMV_bufferSize() */
184:   void                *spmvBuffer;
185:   cusparseDnVecDescr_t vecXDescr, vecYDescr; /* descriptor for the dense vectors in y=op(A)x */
186: };

188: /* This is struct holding the relevant data needed to a MatMult */
189: struct Mat_SeqAIJCUSPARSEMultStruct {
190:   void                *mat;          /* opaque pointer to a matrix. This could be either a cusparseHybMat_t or a CsrMatrix */
191:   cusparseMatDescr_t   descr;        /* Data needed to describe the matrix for a multiply */
192:   THRUSTINTARRAY      *cprowIndices; /* compressed row indices used in the parallel SpMV */
193:   PetscScalar         *alpha_one;    /* pointer to a device "scalar" storing the alpha parameter in the SpMV */
194:   PetscScalar         *beta_zero;    /* pointer to a device "scalar" storing the beta parameter in the SpMV as zero*/
195:   PetscScalar         *beta_one;     /* pointer to a device "scalar" storing the beta parameter in the SpMV as one */
196:   cusparseSpMatDescr_t matDescr;     /* descriptor for the matrix, used by SpMV and SpMM */
197: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
198:   cusparseSpMatDescr_t matDescr_SpMV[3]; // Use separate MatDescr for opA's, to workaround cusparse bugs after 12.4, see https://github.com/NVIDIA/CUDALibrarySamples/issues/212,
199:   cusparseSpMatDescr_t matDescr_SpMM[3]; // and known issues https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-release-12-6
200: #endif
201:   Mat_CusparseSpMV cuSpMV[3]; /* different Mat_CusparseSpMV structs for non-transpose, transpose, conj-transpose */
202:   Mat_SeqAIJCUSPARSEMultStruct() : matDescr(NULL)
203:   {
204:     for (int i = 0; i < 3; i++) {
205:       cuSpMV[i].initialized = PETSC_FALSE;
206: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
207:       matDescr_SpMV[i] = NULL;
208:       matDescr_SpMM[i] = NULL;
209: #endif
210:     }
211:   }
212: };

214: /* This is a larger struct holding all the matrices for a SpMV, and SpMV Transpose */
215: struct Mat_SeqAIJCUSPARSE {
216:   Mat_SeqAIJCUSPARSEMultStruct *mat;               /* pointer to the matrix on the GPU */
217:   Mat_SeqAIJCUSPARSEMultStruct *matTranspose;      /* pointer to the matrix on the GPU (for the transpose ... useful for BiCG) */
218:   THRUSTARRAY                  *workVector;        /* pointer to a workvector to which we can copy the relevant indices of a vector we want to multiply */
219:   THRUSTINTARRAY32             *rowoffsets_gpu;    /* rowoffsets on GPU in non-compressed-row format. It is used to convert CSR to CSC */
220:   PetscInt                      nrows;             /* number of rows of the matrix seen by GPU */
221:   MatCUSPARSEStorageFormat      format;            /* the storage format for the matrix on the device */
222:   PetscBool                     use_cpu_solve;     /* Use AIJ_Seq (I)LU solve */
223:   cudaStream_t                  stream;            /* a stream for the parallel SpMV ... this is not owned and should not be deleted */
224:   cusparseHandle_t              handle;            /* a handle to the cusparse library ... this may not be owned (if we're working in parallel i.e. multiGPUs) */
225:   PetscObjectState              nonzerostate;      /* track nonzero state to possibly recreate the GPU matrix */
226:   size_t                        csr2cscBufferSize; /* stuff used to compute the matTranspose above */
227:   void                         *csr2cscBuffer;     /* This is used as a C struct and is calloc'ed by PetscNew() */
228:   cusparseCsr2CscAlg_t          csr2cscAlg;        /* algorithms can be selected from command line options */
229:   cusparseSpMVAlg_t             spmvAlg;
230:   cusparseSpMMAlg_t             spmmAlg;
231:   THRUSTINTARRAY               *csr2csc_i;
232:   THRUSTINTARRAY               *coords; /* permutation array used in MatSeqAIJCUSPARSEMergeMats */
233: };

235: typedef struct Mat_SeqAIJCUSPARSETriFactors *Mat_SeqAIJCUSPARSETriFactors_p;

237: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
238: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat, Mat, MatReuse, Mat *);
239: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *);

241: using VecSeq_CUDA = Petsc::vec::cupm::impl::VecSeq_CUPM<Petsc::device::cupm::DeviceType::CUDA>;

243: static inline bool isCudaMem(const void *data)
244: {
245:   using namespace Petsc::device::cupm;
246:   auto mtype = PETSC_MEMTYPE_HOST;

248:   PetscFunctionBegin;
249:   PetscCallAbort(PETSC_COMM_SELF, impl::Interface<DeviceType::CUDA>::PetscCUPMGetMemType(data, &mtype));
250:   PetscFunctionReturn(PetscMemTypeDevice(mtype));
251: }