Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
 15: #include <thrust/adjacent_difference.h>
 16: #if PETSC_CPP_VERSION >= 14
 17:   #define PETSC_HAVE_THRUST_ASYNC 1
 18: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 19: #endif
 20: #include <thrust/iterator/constant_iterator.h>
 21: #include <thrust/remove.h>
 22: #include <thrust/sort.h>
 23: #include <thrust/tuple.h>
 24: #include <thrust/unique.h>
 25: #include <thrust/gather.h>
 26: #include <thrust/binary_search.h> // for thrust::lower_bound
 27: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0)
 28:   #include <cuda/std/functional>
 29: #endif

 31: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 32: /*
 33:   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 34:   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
 35: */
 36: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 37: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 38: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};

 40: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 41: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 42: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 43: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 44: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 45: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 46: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 47: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
 48: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 49: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 50: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 51: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 52: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 53: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 54: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 56: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 57: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 58: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 59: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 61: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 62: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 64: static PetscErrorCode       MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 65: static PetscErrorCode       MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 66: static PetscErrorCode       MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
 67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);

 69: // cusparseCreateCsr() separates types for row offsets and column indices in prototype, but requires them to have the same type at runtime!
 70: const cusparseIndexType_t csrRowOffsetsType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
 71: const cusparseIndexType_t csrColIndType     = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

 73: using Csr2coo        = Petsc::mat::aij::cupm::impl::Csr2coo;
 74: using PetscIntToCInt = Petsc::mat::aij::cupm::impl::PetscIntToCInt;

 76: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 77: {
 78:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

 80:   PetscFunctionBegin;
 81:   switch (op) {
 82:   case MAT_CUSPARSE_MULT:
 83:     cusparsestruct->format = format;
 84:     break;
 85:   case MAT_CUSPARSE_ALL:
 86:     cusparsestruct->format = format;
 87:     break;
 88:   default:
 89:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
 90:   }
 91:   PetscFunctionReturn(PETSC_SUCCESS);
 92: }

 94: /*@
 95:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
 96:   operation. Only the `MatMult()` operation can use different GPU storage formats

 98:   Not Collective

100:   Input Parameters:
101: + A      - Matrix of type `MATSEQAIJCUSPARSE`
102: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
103:            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
104: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

106:   Level: intermediate

108: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
109: @*/
110: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
111: {
112:   PetscFunctionBegin;
114:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
115:   PetscFunctionReturn(PETSC_SUCCESS);
116: }

118: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
119: {
120:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

122:   PetscFunctionBegin;
123:   cusparsestruct->use_cpu_solve = use_cpu;
124:   PetscFunctionReturn(PETSC_SUCCESS);
125: }

127: /*@
128:   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

130:   Input Parameters:
131: + A       - Matrix of type `MATSEQAIJCUSPARSE`
132: - use_cpu - set flag for using the built-in CPU `MatSolve()`

134:   Level: intermediate

136:   Note:
137:   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
138:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
139:   This method to specify if the solve is done on the CPU or GPU (GPU is the default).

141: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
144: {
145:   PetscFunctionBegin;
147:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
148:   PetscFunctionReturn(PETSC_SUCCESS);
149: }

151: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
152: {
153:   PetscFunctionBegin;
154:   switch (op) {
155:   case MAT_FORM_EXPLICIT_TRANSPOSE:
156:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
157:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
158:     A->form_explicit_transpose = flg;
159:     break;
160:   default:
161:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
162:     break;
163:   }
164:   PetscFunctionReturn(PETSC_SUCCESS);
165: }

167: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
168: {
169:   MatCUSPARSEStorageFormat format;
170:   PetscBool                flg;
171:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

173:   PetscFunctionBegin;
174:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
175:   if (A->factortype == MAT_FACTOR_NONE) {
176:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
177:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

179:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
180:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
181:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
182:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
183:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
184:     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
185:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
187:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
188:     PetscCall(
189:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
190:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
191:   }
192:   PetscOptionsHeadEnd();
193:   PetscFunctionReturn(PETSC_SUCCESS);
194: }

196: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
197: {
198:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
199:   PetscInt                      m  = A->rmap->n;
200:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
201:   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
202:   const MatScalar              *Aa = a->a;
203:   PetscInt                     *Mi, *Mj, Mnz;
204:   PetscScalar                  *Ma;

206:   PetscFunctionBegin;
207:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
208:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
209:     if (!fs->csrRowPtr) {                    // Is this the first time we are doing setup? Use csrRowPtr since it is not null even when m=0
210:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
211:       Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
212:       PetscCall(PetscMalloc1(m + 1, &Mi));
213:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
214:       PetscCall(PetscMalloc1(Mnz, &Ma));
215:       Mi[0] = 0;
216:       for (PetscInt i = 0; i < m; i++) {
217:         PetscInt llen = Ai[i + 1] - Ai[i];
218:         PetscInt ulen = adiag[i] - adiag[i + 1];
219:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
220:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
221:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
222:         Mi[i + 1] = Mi[i] + llen + ulen;
223:       }
224:       // Copy M (L,U) from host to device
225:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
226:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
227:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
228:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
229:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

231:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
232:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
233:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
234:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
235:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
236:       cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
237:       cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;

239:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
240:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
241:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

243:       fillMode = CUSPARSE_FILL_MODE_UPPER;
244:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
245:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
246:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
247:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

249:       // Allocate work vectors in SpSv
250:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
251:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

253:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
254:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

256:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
257:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
258:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
259:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
260:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
261:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
262:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

264:       // Record for reuse
265:       fs->csrRowPtr_h = Mi;
266:       fs->csrVal_h    = Ma;
267:       PetscCall(PetscFree(Mj));
268:     }
269:     // Copy the value
270:     Mi  = fs->csrRowPtr_h;
271:     Ma  = fs->csrVal_h;
272:     Mnz = Mi[m];
273:     for (PetscInt i = 0; i < m; i++) {
274:       PetscInt llen = Ai[i + 1] - Ai[i];
275:       PetscInt ulen = adiag[i] - adiag[i + 1];
276:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
277:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
278:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
279:     }
280:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

282: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
283:     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
284:       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
285:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
286:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
287:     } else
288: #endif
289:     {
290:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
291:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

293:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
294:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
295:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
296:     }
297:   }
298:   PetscFunctionReturn(PETSC_SUCCESS);
299: }

301: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
302: {
303:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
304:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
305:   IS                            isrow = a->row, isicol = a->icol;
306:   PetscBool                     row_identity, col_identity;
307:   PetscInt                      n = A->rmap->n;

309:   PetscFunctionBegin;
310:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
311:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));

313:   cusparseTriFactors->nnz = a->nz;

315:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
316:   /* lower triangular indices */
317:   PetscCall(ISIdentity(isrow, &row_identity));
318:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
319:     const PetscInt *r;

321:     PetscCall(ISGetIndices(isrow, &r));
322:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
323:     cusparseTriFactors->rpermIndices->assign(r, r + n);
324:     PetscCall(ISRestoreIndices(isrow, &r));
325:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
326:   }

328:   /* upper triangular indices */
329:   PetscCall(ISIdentity(isicol, &col_identity));
330:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
331:     const PetscInt *c;

333:     PetscCall(ISGetIndices(isicol, &c));
334:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
335:     cusparseTriFactors->cpermIndices->assign(c, c + n);
336:     PetscCall(ISRestoreIndices(isicol, &c));
337:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
338:   }
339:   PetscFunctionReturn(PETSC_SUCCESS);
340: }

342: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
343: {
344:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
345:   PetscInt                      m  = A->rmap->n;
346:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
347:   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
348:   const MatScalar              *Aa = a->a;
349:   PetscInt                     *Mj, Mnz;
350:   PetscScalar                  *Ma, *D;

352:   PetscFunctionBegin;
353:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
354:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
355:     if (!fs->csrRowPtr) {                    // Is this the first time we are doing setup? Use csrRowPtr since it is not null even m=0
356:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
357:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
358:       Mnz = Ai[m]; // Unz (with the unit diagonal)
359:       PetscCall(PetscMalloc1(Mnz, &Ma));
360:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
361:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
362:       for (PetscInt i = 0; i < m; i++) {
363:         PetscInt ulen = Ai[i + 1] - Ai[i];
364:         Mj[Ai[i]]     = i;                                              // diagonal entry
365:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
366:       }
367:       // Copy M (U) from host to device
368:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
369:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
370:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
371:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
372:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
373:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

375:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
376:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
377:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
378:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
379:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
380:       cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
381:       cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal

383:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
384:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
385:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

387:       // Allocate work vectors in SpSv
388:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
389:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

391:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
392:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

394:       // Query buffer sizes for SpSV and then allocate buffers
395:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
396:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
397:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

399:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
400:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
401:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

403:       // Record for reuse
404:       fs->csrVal_h = Ma;
405:       fs->diag_h   = D;
406:       PetscCall(PetscFree(Mj));
407:     }
408:     // Copy the value
409:     Ma  = fs->csrVal_h;
410:     D   = fs->diag_h;
411:     Mnz = Ai[m];
412:     for (PetscInt i = 0; i < m; i++) {
413:       D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
414:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
415:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
416:     }
417:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
418:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

420: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
421:     if (fs->updatedSpSVAnalysis) {
422:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
423:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
424:     } else
425: #endif
426:     {
427:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
428:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
429:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
430:       fs->updatedSpSVAnalysis = PETSC_TRUE;
431:     }
432:   }
433:   PetscFunctionReturn(PETSC_SUCCESS);
434: }

436: // Solve Ut D U x = b
437: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
438: {
439:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
440:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
441:   const PetscScalar                    *barray;
442:   PetscScalar                          *xarray;
443:   thrust::device_ptr<const PetscScalar> bGPU;
444:   thrust::device_ptr<PetscScalar>       xGPU;
445:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
446:   PetscInt                              m   = A->rmap->n;

448:   PetscFunctionBegin;
449:   PetscCall(PetscLogGpuTimeBegin());
450:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
451:   PetscCall(VecCUDAGetArrayRead(b, &barray));
452:   xGPU = thrust::device_pointer_cast(xarray);
453:   bGPU = thrust::device_pointer_cast(barray);

455:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
456:   if (fs->rpermIndices) {
457:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
458:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
459:   } else {
460:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
461:   }

463:   // Solve Ut Y = X
464:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
465:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

467:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
468:   // It is basically a vector element-wise multiplication, but cublas does not have it!
469: #if CCCL_VERSION >= 3001000
470:   auto multiplies = cuda::std::multiplies<PetscScalar>();
471: #else
472:   auto multiplies = thrust::multiplies<PetscScalar>();
473: #endif
474:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));

476:   // Solve U X = Y
477:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
478:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
479:   } else {
480:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
481:   }
482:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

484:   // Reorder X with the column permutation if needed, and put the result back to x
485:   if (fs->cpermIndices) {
486:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
487:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
488:   }

490:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
491:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
492:   PetscCall(PetscLogGpuTimeEnd());
493:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
494:   PetscFunctionReturn(PETSC_SUCCESS);
495: }

497: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
498: {
499:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
500:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
501:   IS                            ip                 = a->row;
502:   PetscBool                     perm_identity;
503:   PetscInt                      n = A->rmap->n;

505:   PetscFunctionBegin;
506:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

508:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
509:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

511:   A->offloadmask = PETSC_OFFLOAD_BOTH;

513:   /* lower triangular indices */
514:   PetscCall(ISIdentity(ip, &perm_identity));
515:   if (!perm_identity) {
516:     IS              iip;
517:     const PetscInt *irip, *rip;

519:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
520:     PetscCall(ISGetIndices(iip, &irip));
521:     PetscCall(ISGetIndices(ip, &rip));
522:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
523:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
524:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
525:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
526:     PetscCall(ISRestoreIndices(iip, &irip));
527:     PetscCall(ISDestroy(&iip));
528:     PetscCall(ISRestoreIndices(ip, &rip));
529:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
530:   }
531:   PetscFunctionReturn(PETSC_SUCCESS);
532: }

534: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
535: {
536:   PetscFunctionBegin;
537:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
538:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
539:   B->offloadmask            = PETSC_OFFLOAD_CPU;
540:   B->ops->solve             = MatSolve_SeqAIJCUSPARSE_Cholesky;
541:   B->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_Cholesky; // since symmetric
542:   B->ops->matsolve          = NULL;
543:   B->ops->matsolvetranspose = NULL;
544:   /* get the triangular factors */
545:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
546:   PetscFunctionReturn(PETSC_SUCCESS);
547: }

549: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
550: {
551:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
552:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
553:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
554:   cusparseIndexBase_t           indexBase;

556:   PetscFunctionBegin;
557:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
558:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
559:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
560:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
561:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
562:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
563:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
564:   PetscCall(PetscLogGpuTimeBegin());
565:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
566:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
567:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
568:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
569:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
570:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
571:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

573:     /* set alpha and beta */
574:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
575:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
576:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
577:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
578:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
579:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

581:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
582:       CsrMatrix *matrixT      = new CsrMatrix;
583:       matstructT->mat         = matrixT;
584:       matrixT->num_rows       = A->cmap->n;
585:       matrixT->num_cols       = A->rmap->n;
586:       matrixT->num_entries    = a->nz;
587:       matrixT->row_offsets    = new THRUSTINTARRAY(matrixT->num_rows + 1);
588:       matrixT->column_indices = new THRUSTINTARRAY(a->nz);
589:       matrixT->values         = new THRUSTARRAY(a->nz);

591:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
592:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
593:       PetscCallCUSPARSE(cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), csrRowOffsetsType, csrColIndType, indexBase, cusparse_scalartype));
594:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
595:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
596:     }
597:   }
598:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
599:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
600:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
601:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
602:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
603:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
604:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
605:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
606:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
607:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
608:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
609:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
610:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
611:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
612:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
613:     }
614:     if (!cusparsestruct->csr2csc_i) { // not using cusparseCsr2cscEx2() because it requires 32-bit indices
615:       THRUSTINTARRAY row_indices(matrix->num_entries);

617:       // Transpose the matrix via COO, i.e., by putting the row indices in column_indices[] and the column indices in row_indices[]
618:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); // will store the matrix to matrixT permutation, i.e., entry matrixT[i] is matrix[csr2csc_i[i]]
619:       PetscCallThrust(thrust::sequence(thrust::device, cusparsestruct->csr2csc_i->begin(), cusparsestruct->csr2csc_i->end()));
620:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->rmap->n), Csr2coo(cusparsestruct->rowoffsets_gpu->data().get(), matrixT->column_indices->data().get())));
621:       row_indices = *matrix->column_indices;
622:       // Sort the COO by row then column, and get the permutation csr2csc_i[]
623:       PetscCallThrust(thrust::sort_by_key(thrust::device, thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), matrixT->column_indices->begin())), thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(), matrixT->column_indices->end())),
624:                                           cusparsestruct->csr2csc_i->begin()));
625:       // Finalize matrixT's row_offsets by looking up row_indices[]
626:       PetscCallThrust(thrust::lower_bound(thrust::device, row_indices.begin(), row_indices.end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->cmap->n + 1), matrixT->row_offsets->begin()));
627:     }
628:     PetscCallThrust(thrust::gather(thrust::device, cusparsestruct->csr2csc_i->begin(), cusparsestruct->csr2csc_i->end(), matrix->values->begin(), matrixT->values->begin()));
629:   }
630:   PetscCall(PetscLogGpuTimeEnd());
631:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
632:   /* the compressed row indices is not used for matTranspose */
633:   matstructT->cprowIndices = NULL;
634:   /* assign the pointer */
635:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
636:   A->transupdated                                = PETSC_TRUE;
637:   PetscFunctionReturn(PETSC_SUCCESS);
638: }

640: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
641: {
642:   const PetscScalar                    *barray;
643:   PetscScalar                          *xarray;
644:   thrust::device_ptr<const PetscScalar> bGPU;
645:   thrust::device_ptr<PetscScalar>       xGPU;
646:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
647:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
648:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
649:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
650:   PetscInt                              m   = A->rmap->n;

652:   PetscFunctionBegin;
653:   PetscCall(PetscLogGpuTimeBegin());
654:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
655:   PetscCall(VecCUDAGetArrayRead(b, &barray));
656:   xGPU = thrust::device_pointer_cast(xarray);
657:   bGPU = thrust::device_pointer_cast(barray);

659:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
660:   if (fs->rpermIndices) {
661:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
662:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
663:   } else {
664:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
665:   }

667:   // Solve L Y = X
668:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
669:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
670:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

672:   // Solve U X = Y
673:   if (fs->cpermIndices) {
674:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
675:   } else {
676:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
677:   }
678:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

680:   // Reorder X with the column permutation if needed, and put the result back to x
681:   if (fs->cpermIndices) {
682:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
683:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
684:   }
685:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
686:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
687:   PetscCall(PetscLogGpuTimeEnd());
688:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
689:   PetscFunctionReturn(PETSC_SUCCESS);
690: }

692: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
693: {
694:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
695:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
696:   const PetscScalar                    *barray;
697:   PetscScalar                          *xarray;
698:   thrust::device_ptr<const PetscScalar> bGPU;
699:   thrust::device_ptr<PetscScalar>       xGPU;
700:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
701:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
702:   PetscInt                              m   = A->rmap->n;

704:   PetscFunctionBegin;
705:   PetscCall(PetscLogGpuTimeBegin());
706:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
707:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
708:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
709:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

711:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
712:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
713:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
714:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
715:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
716:   }

718:   if (!fs->updatedTransposeSpSVAnalysis) {
719:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

721:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
722:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
723:   }

725:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
726:   PetscCall(VecCUDAGetArrayRead(b, &barray));
727:   xGPU = thrust::device_pointer_cast(xarray);
728:   bGPU = thrust::device_pointer_cast(barray);

730:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
731:   if (fs->rpermIndices) {
732:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
733:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
734:   } else {
735:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
736:   }

738:   // Solve Ut Y = X
739:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
740:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

742:   // Solve Lt X = Y
743:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
744:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
745:   } else {
746:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
747:   }
748:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

750:   // Reorder X with the column permutation if needed, and put the result back to x
751:   if (fs->cpermIndices) {
752:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
753:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
754:   }

756:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
757:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
758:   PetscCall(PetscLogGpuTimeEnd());
759:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
760:   PetscFunctionReturn(PETSC_SUCCESS);
761: }

763: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
764: {
765:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
766:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
767:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
768:   CsrMatrix                    *Acsr;
769:   PetscInt                      m, nz;
770:   PetscBool                     flg;

772:   PetscFunctionBegin;
773:   if (PetscDefined(USE_DEBUG)) {
774:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
775:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
776:   }

778:   /* Copy A's value to fact */
779:   m  = fact->rmap->n;
780:   nz = aij->nz;
781:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
782:   Acsr = (CsrMatrix *)Acusp->mat->mat;
783:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

785:   PetscCall(PetscLogGpuTimeBegin());
786:   /* Factorize fact inplace */
787:   if (m)
788:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
789:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
790:   if (PetscDefined(USE_DEBUG)) {
791:     int              numerical_zero;
792:     cusparseStatus_t status;
793:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
794:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
795:   }

797: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
798:   if (fs->updatedSpSVAnalysis) {
799:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
800:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
801:   } else
802: #endif
803:   {
804:     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
805:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
806:     */
807:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

809:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

811:     fs->updatedSpSVAnalysis = PETSC_TRUE;
812:     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
813:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
814:   }

816:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
817:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
818:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
819:   fact->ops->matsolve          = NULL;
820:   fact->ops->matsolvetranspose = NULL;
821:   PetscCall(PetscLogGpuTimeEnd());
822:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
823:   PetscFunctionReturn(PETSC_SUCCESS);
824: }

826: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
827: {
828:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
829:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
830:   PetscInt                      m, nz;

832:   PetscFunctionBegin;
833:   if (PetscDefined(USE_DEBUG)) {
834:     PetscBool flg, diagDense;

836:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
837:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
838:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
839:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
840:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
841:   }

843:   /* Free the old stale stuff */
844:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

846:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
847:      but they will not be used. Allocate them just for easy debugging.
848:    */
849:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

851:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
852:   fact->factortype             = MAT_FACTOR_ILU;
853:   fact->info.factor_mallocs    = 0;
854:   fact->info.fill_ratio_given  = info->fill;
855:   fact->info.fill_ratio_needed = 1.0;

857:   aij->row = NULL;
858:   aij->col = NULL;

860:   /* ====================================================================== */
861:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
862:   /* We'll do in-place factorization on fact                                */
863:   /* ====================================================================== */
864:   const PetscInt *Ai, *Aj;

866:   m  = fact->rmap->n;
867:   nz = aij->nz;

869:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
870:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
871:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
872:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed

874:   PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
875:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
876:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));

878:   /* ====================================================================== */
879:   /* Create descriptors for M, L, U                                         */
880:   /* ====================================================================== */
881:   cusparseFillMode_t fillMode;
882:   cusparseDiagType_t diagType;

884:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
885:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
886:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

888:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
889:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
890:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
891:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
892:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
893:   */
894:   fillMode = CUSPARSE_FILL_MODE_LOWER;
895:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
896:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
897:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
898:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

900:   fillMode = CUSPARSE_FILL_MODE_UPPER;
901:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
902:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
903:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
904:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

906:   /* ========================================================================= */
907:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
908:   /* ========================================================================= */
909:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
910:   if (m)
911:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
912:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

914:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
915:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

917:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
918:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

920:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
921:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

923:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
924:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

926:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
927:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
928:      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
929:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
930:    */
931:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
932:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
933:     fs->spsvBuffer_L = fs->factBuffer_M;
934:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
935:   } else {
936:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
937:     fs->spsvBuffer_U = fs->factBuffer_M;
938:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
939:   }

941:   /* ========================================================================== */
942:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
943:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
944:   /* ========================================================================== */
945:   int              structural_zero;
946:   cusparseStatus_t status;

948:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
949:   if (m)
950:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
951:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
952:   if (PetscDefined(USE_DEBUG)) {
953:     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
954:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
955:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
956:   }

958:   /* Estimate FLOPs of the numeric factorization */
959:   {
960:     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
961:     PetscInt       *Ai, nzRow, nzLeft;
962:     const PetscInt *adiag;
963:     PetscLogDouble  flops = 0.0;

965:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
966:     Ai = Aseq->i;
967:     for (PetscInt i = 0; i < m; i++) {
968:       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
969:         nzRow  = Ai[i + 1] - Ai[i];
970:         nzLeft = adiag[i] - Ai[i];
971:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
972:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
973:         */
974:         nzLeft = (nzRow - 1) / 2;
975:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
976:       }
977:     }
978:     fs->numericFactFlops = flops;
979:   }
980:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
981:   PetscFunctionReturn(PETSC_SUCCESS);
982: }

984: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
985: {
986:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
987:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
988:   const PetscScalar            *barray;
989:   PetscScalar                  *xarray;

991:   PetscFunctionBegin;
992:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
993:   PetscCall(VecCUDAGetArrayRead(b, &barray));
994:   PetscCall(PetscLogGpuTimeBegin());

996:   /* Solve L*y = b */
997:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
998:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
999:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1000:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1002:   /* Solve Lt*x = y */
1003:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1004:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1005:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1007:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1008:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1010:   PetscCall(PetscLogGpuTimeEnd());
1011:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1012:   PetscFunctionReturn(PETSC_SUCCESS);
1013: }

1015: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1016: {
1017:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1018:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1019:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1020:   CsrMatrix                    *Acsr;
1021:   PetscInt                      m, nz;
1022:   PetscBool                     flg;

1024:   PetscFunctionBegin;
1025:   if (PetscDefined(USE_DEBUG)) {
1026:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1027:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1028:   }

1030:   /* Copy A's value to fact */
1031:   m  = fact->rmap->n;
1032:   nz = aij->nz;
1033:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1034:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1035:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1037:   /* Factorize fact inplace */
1038:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1039:      csric02() only takes the lower triangular part of matrix A to perform factorization.
1040:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1041:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1042:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1043:    */
1044:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1045:   if (PetscDefined(USE_DEBUG)) {
1046:     int              numerical_zero;
1047:     cusparseStatus_t status;
1048:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1049:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1050:   }

1052: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1053:   if (fs->updatedSpSVAnalysis) {
1054:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1055:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1056:   } else
1057: #endif
1058:   {
1059:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1061:     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1062:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1063:   */
1064:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1065:     fs->updatedSpSVAnalysis = PETSC_TRUE;
1066:   }

1068:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1069:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1070:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1071:   fact->ops->matsolve          = NULL;
1072:   fact->ops->matsolvetranspose = NULL;
1073:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1074:   PetscFunctionReturn(PETSC_SUCCESS);
1075: }

1077: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1078: {
1079:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1080:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1081:   PetscInt                      m, nz;

1083:   PetscFunctionBegin;
1084:   if (PetscDefined(USE_DEBUG)) {
1085:     PetscBool flg, diagDense;

1087:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1088:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1089:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1090:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1091:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1092:   }

1094:   /* Free the old stale stuff */
1095:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1097:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1098:      but they will not be used. Allocate them just for easy debugging.
1099:    */
1100:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1102:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1103:   fact->factortype             = MAT_FACTOR_ICC;
1104:   fact->info.factor_mallocs    = 0;
1105:   fact->info.fill_ratio_given  = info->fill;
1106:   fact->info.fill_ratio_needed = 1.0;

1108:   aij->row = NULL;
1109:   aij->col = NULL;

1111:   /* ====================================================================== */
1112:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1113:   /* We'll do in-place factorization on fact                                */
1114:   /* ====================================================================== */
1115:   const PetscInt *Ai, *Aj;

1117:   m  = fact->rmap->n;
1118:   nz = aij->nz;

1120:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1121:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1122:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1123:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed

1125:   PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
1126:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
1127:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));

1129:   /* ====================================================================== */
1130:   /* Create mat descriptors for M, L                                        */
1131:   /* ====================================================================== */
1132:   cusparseFillMode_t fillMode;
1133:   cusparseDiagType_t diagType;

1135:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1136:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1137:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1139:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1140:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1141:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1142:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1143:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1144:   */
1145:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1146:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1147:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1148:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1149:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1151:   /* ========================================================================= */
1152:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1153:   /* ========================================================================= */
1154:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1155:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

1157:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1158:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1160:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1161:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1163:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1164:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1166:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1167:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1169:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1170:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1171:    */
1172:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1173:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1174:     fs->spsvBuffer_L = fs->factBuffer_M;
1175:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1176:   } else {
1177:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1178:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1179:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1180:   }

1182:   /* ========================================================================== */
1183:   /* Perform analysis of ic0 on M                                               */
1184:   /* The lower triangular part of M has the same sparsity pattern as L          */
1185:   /* ========================================================================== */
1186:   int              structural_zero;
1187:   cusparseStatus_t status;

1189:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1190:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1191:   if (PetscDefined(USE_DEBUG)) {
1192:     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1193:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1194:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1195:   }

1197:   /* Estimate FLOPs of the numeric factorization */
1198:   {
1199:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1200:     PetscInt      *Ai, nzRow, nzLeft;
1201:     PetscLogDouble flops = 0.0;

1203:     Ai = Aseq->i;
1204:     for (PetscInt i = 0; i < m; i++) {
1205:       nzRow = Ai[i + 1] - Ai[i];
1206:       if (nzRow > 1) {
1207:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1208:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1209:         */
1210:         nzLeft = (nzRow - 1) / 2;
1211:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1212:       }
1213:     }
1214:     fs->numericFactFlops = flops;
1215:   }
1216:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1217:   PetscFunctionReturn(PETSC_SUCCESS);
1218: }

1220: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1221: {
1222:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
1223:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

1225:   PetscFunctionBegin;
1226:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1227:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1228:   B->offloadmask = PETSC_OFFLOAD_CPU;

1230:   if (!cusparsestruct->use_cpu_solve) {
1231:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
1232:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1233:   }
1234:   B->ops->matsolve          = NULL;
1235:   B->ops->matsolvetranspose = NULL;

1237:   /* get the triangular factors */
1238:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
1239:   PetscFunctionReturn(PETSC_SUCCESS);
1240: }

1242: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1243: {
1244:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

1246:   PetscFunctionBegin;
1247:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1248:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1249:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1250:   PetscFunctionReturn(PETSC_SUCCESS);
1251: }

1253: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1254: {
1255:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1257:   PetscFunctionBegin;
1258:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1259:   if (!info->factoronhost) {
1260:     PetscCall(ISIdentity(isrow, &row_identity));
1261:     PetscCall(ISIdentity(iscol, &col_identity));
1262:   }
1263:   if (!info->levels && row_identity && col_identity) {
1264:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1265:   } else {
1266:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1267:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1268:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1269:   }
1270:   PetscFunctionReturn(PETSC_SUCCESS);
1271: }

1273: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1274: {
1275:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1277:   PetscFunctionBegin;
1278:   PetscBool perm_identity = PETSC_FALSE;
1279:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1280:   if (!info->levels && perm_identity) {
1281:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1282:   } else {
1283:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1284:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1285:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1286:   }
1287:   PetscFunctionReturn(PETSC_SUCCESS);
1288: }

1290: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1291: {
1292:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1294:   PetscFunctionBegin;
1295:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1296:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1297:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1298:   PetscFunctionReturn(PETSC_SUCCESS);
1299: }

1301: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1302: {
1303:   PetscFunctionBegin;
1304:   *type = MATSOLVERCUSPARSE;
1305:   PetscFunctionReturn(PETSC_SUCCESS);
1306: }

1308: /*MC
1309:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1310:   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1311:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1312:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1313:   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1314:   algorithms are not recommended. This class does NOT support direct solver operations.

1316:   Level: beginner

1318: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1319:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1320: M*/

1322: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1323: {
1324:   PetscInt n = A->rmap->n;

1326:   PetscFunctionBegin;
1327:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1328:   PetscCall(MatSetSizes(*B, n, n, n, n));
1329:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
1330:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

1332:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1333:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1334:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1335:     if (!A->boundtocpu) {
1336:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1337:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1338:     } else {
1339:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1340:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1341:     }
1342:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1343:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1344:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1345:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1346:     if (!A->boundtocpu) {
1347:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1348:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1349:     } else {
1350:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1351:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1352:     }
1353:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1354:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1355:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

1357:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1358:   (*B)->canuseordering = PETSC_TRUE;
1359:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
1360:   PetscFunctionReturn(PETSC_SUCCESS);
1361: }

1363: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1364: {
1365:   Mat_SeqAIJ                   *a    = (Mat_SeqAIJ *)A->data;
1366:   Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1367:   Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;

1369:   PetscFunctionBegin;
1370:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1371:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1372:     if (A->factortype == MAT_FACTOR_NONE) {
1373:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1374:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1375:     } else if (fs->csrVal) {
1376:       /* We have a factorized matrix on device and are able to copy it to host */
1377:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1378:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1379:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1380:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1381:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1382:   }
1383:   PetscFunctionReturn(PETSC_SUCCESS);
1384: }

1386: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (CUDA specialisation) */
1387: struct MatSeqAIJCUSPARSE_Policy {
1388:   typedef Mat_SeqAIJCUSPARSE           mat_struct_type;
1389:   typedef Mat_SeqAIJCUSPARSEMultStruct mult_struct_type;

1391:   static int storage_format_csr() { return (int)MAT_CUSPARSE_CSR; }
1392:   static int storage_format_ell() { return (int)MAT_CUSPARSE_ELL; }
1393:   static int storage_format_hyb() { return (int)MAT_CUSPARSE_HYB; }

1395:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
1396:   static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJCUSPARSECopyFromGPU(A); }
1397:   static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJCUSPARSEInvalidateTranspose(A, d); }
1398:   static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, t, r, C); }
1399:   static const char    *mat_type_name;

1401:   static PetscErrorCode Destroy(Mat A) { return MatSeqAIJCUSPARSE_Destroy(A); }
1402:   static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)spptr); }
1403:   static const char    *set_format_c;
1404:   static const char    *set_use_cpu_solve_c;
1405:   static const char    *product_seqdense_device_c;
1406:   static const char    *product_seqdense_c;
1407:   static const char    *product_self_c;
1408:   static const char    *seq_convert_hypre_c;

1410:   static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecCUDAGetArrayRead(v, a); }
1411:   static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecCUDARestoreArrayRead(v, a); }
1412:   static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecCUDAGetArrayWrite(v, a); }
1413:   static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecCUDARestoreArrayWrite(v, a); }
1414: };
1415: const char *MatSeqAIJCUSPARSE_Policy::mat_type_name             = MATSEQAIJCUSPARSE;
1416: const char *MatSeqAIJCUSPARSE_Policy::set_format_c              = "MatCUSPARSESetFormat_C";
1417: const char *MatSeqAIJCUSPARSE_Policy::set_use_cpu_solve_c       = "MatCUSPARSESetUseCPUSolve_C";
1418: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C";
1419: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_c        = "MatProductSetFromOptions_seqaijcusparse_seqdense_C";
1420: const char *MatSeqAIJCUSPARSE_Policy::product_self_c            = "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C";
1421: const char *MatSeqAIJCUSPARSE_Policy::seq_convert_hypre_c       = "MatConvert_seqaijcusparse_hypre_C";

1423: using MatSeqAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatSeqAIJCUSPARSE_Policy>;

1425: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1426: {
1427:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1428: }

1430: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1431: {
1432:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1433: }

1435: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1436: {
1437:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1438: }

1440: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1441: {
1442:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1443: }

1445: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1446: {
1447:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1448: }

1450: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1451: {
1452:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1453: }

1455: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1456: {
1457:   Mat_SeqAIJCUSPARSE *cusp;
1458:   CsrMatrix          *matrix;

1460:   PetscFunctionBegin;
1461:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1462:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1463:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1464:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1465:   matrix = (CsrMatrix *)cusp->mat->mat;

1467:   if (i) *i = matrix->row_offsets->data().get();
1468:   if (j) *j = matrix->column_indices->data().get();
1469:   if (a) *a = matrix->values->data().get();
1470:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1471:   PetscFunctionReturn(PETSC_SUCCESS);
1472: }

1474: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1475: {
1476:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1477:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
1478:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
1479:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
1480:   PetscBool                     both           = PETSC_TRUE;

1482:   PetscFunctionBegin;
1483:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1484:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1485:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1486:       CsrMatrix *matrix;
1487:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

1489:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1490:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1491:       matrix->values->assign(a->a, a->a + a->nz);
1492:       PetscCallCUDA(WaitForCUDA());
1493:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1494:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1495:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
1496:     } else {
1497:       PetscInt nnz;
1498:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1499:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
1500:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1501:       delete cusparsestruct->workVector;
1502:       delete cusparsestruct->rowoffsets_gpu;
1503:       cusparsestruct->workVector     = NULL;
1504:       cusparsestruct->rowoffsets_gpu = NULL;
1505:       try {
1506:         if (a->compressedrow.use) {
1507:           m    = a->compressedrow.nrows;
1508:           ii   = a->compressedrow.i;
1509:           ridx = a->compressedrow.rindex;
1510:         } else {
1511:           m    = A->rmap->n;
1512:           ii   = a->i;
1513:           ridx = NULL;
1514:         }
1515:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1516:         if (!a->a) {
1517:           nnz  = ii[m];
1518:           both = PETSC_FALSE;
1519:         } else nnz = a->nz;
1520:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

1522:         /* create cusparse matrix */
1523:         cusparsestruct->nrows = m;
1524:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
1525:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
1526:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
1527:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1529:         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1530:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1531:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1532:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1533:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1534:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1535:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

1537:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1538:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1539:           /* set the matrix */
1540:           CsrMatrix *mat   = new CsrMatrix;
1541:           mat->num_rows    = m;
1542:           mat->num_cols    = A->cmap->n;
1543:           mat->num_entries = nnz;
1544:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY(m + 1));
1545:           mat->row_offsets->assign(ii, ii + m + 1);
1546:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY(nnz));
1547:           mat->column_indices->assign(a->j, a->j + nnz);

1549:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1550:           if (a->a) mat->values->assign(a->a, a->a + nnz);

1552:           /* assign the pointer */
1553:           matstruct->mat = mat;
1554:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1555:             PetscCallCUSPARSE(cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1556:           }
1557:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1558:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1559:         }

1561:         /* assign the compressed row indices */
1562:         if (a->compressedrow.use) {
1563:           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
1564:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1565:           matstruct->cprowIndices->assign(ridx, ridx + m);
1566:           tmp = m;
1567:         } else {
1568:           cusparsestruct->workVector = NULL;
1569:           matstruct->cprowIndices    = NULL;
1570:           tmp                        = 0;
1571:         }
1572:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

1574:         /* assign the pointer */
1575:         cusparsestruct->mat = matstruct;
1576:       } catch (char *ex) {
1577:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
1578:       }
1579:       PetscCallCUDA(WaitForCUDA());
1580:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1581:       cusparsestruct->nonzerostate = A->nonzerostate;
1582:     }
1583:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1584:   }
1585:   PetscFunctionReturn(PETSC_SUCCESS);
1586: }

1588: struct VecCUDAPlusEquals {
1589:   template <typename Tuple>
1590:   __host__ __device__ void operator()(Tuple t)
1591:   {
1592:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1593:   }
1594: };

1596: struct VecCUDAEquals {
1597:   template <typename Tuple>
1598:   __host__ __device__ void operator()(Tuple t)
1599:   {
1600:     thrust::get<1>(t) = thrust::get<0>(t);
1601:   }
1602: };

1604: struct VecCUDAEqualsReverse {
1605:   template <typename Tuple>
1606:   __host__ __device__ void operator()(Tuple t)
1607:   {
1608:     thrust::get<0>(t) = thrust::get<1>(t);
1609:   }
1610: };

1612: struct MatProductCtx_MatMatCusparse {
1613:   PetscBool      cisdense;
1614:   PetscScalar   *Bt;
1615:   Mat            X;
1616:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1617:   PetscLogDouble flops;
1618:   CsrMatrix     *Bcsr;

1620:   cusparseSpMatDescr_t  matSpBDescr;
1621:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
1622:   cusparseDnMatDescr_t  matBDescr;
1623:   cusparseDnMatDescr_t  matCDescr;
1624:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1625:   void                 *dBuffer4;
1626:   void                 *dBuffer5;
1627:   size_t                mmBufferSize;
1628:   void                 *mmBuffer;
1629:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1630:   cusparseSpGEMMDescr_t spgemmDesc;
1631: };

1633: static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
1634: {
1635:   MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

1637:   PetscFunctionBegin;
1638:   PetscCallCUDA(cudaFree(mmdata->Bt));
1639:   delete mmdata->Bcsr;
1640:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
1641:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1642:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1643:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1644:   PetscCallCUDA(cudaFree(mmdata->dBuffer4));
1645:   PetscCallCUDA(cudaFree(mmdata->dBuffer5));
1646:   PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1647:   PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
1648:   PetscCall(MatDestroy(&mmdata->X));
1649:   PetscCall(PetscFree(mmdata));
1650:   PetscFunctionReturn(PETSC_SUCCESS);
1651: }

1653: #include <../src/mat/impls/dense/seq/dense.h>

1655: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1656: {
1657:   Mat_Product                  *product = C->product;
1658:   Mat                           A, B;
1659:   PetscInt                      m, n, blda, clda;
1660:   PetscBool                     flg, biscuda;
1661:   Mat_SeqAIJCUSPARSE           *cusp;
1662:   cusparseOperation_t           opA;
1663:   const PetscScalar            *barray;
1664:   PetscScalar                  *carray;
1665:   MatProductCtx_MatMatCusparse *mmdata;
1666:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1667:   CsrMatrix                    *csrmat;

1669:   PetscFunctionBegin;
1670:   MatCheckProduct(C, 1);
1671:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1672:   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
1673:   A      = product->A;
1674:   B      = product->B;
1675:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1676:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1677:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1678:      Instead of silently accepting the wrong answer, I prefer to raise the error */
1679:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1680:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1681:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1682:   switch (product->type) {
1683:   case MATPRODUCT_AB:
1684:   case MATPRODUCT_PtAP:
1685:     mat = cusp->mat;
1686:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1687:     m   = A->rmap->n;
1688:     n   = B->cmap->n;
1689:     break;
1690:   case MATPRODUCT_AtB:
1691:     if (!A->form_explicit_transpose) {
1692:       mat = cusp->mat;
1693:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1694:     } else {
1695:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1696:       mat = cusp->matTranspose;
1697:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1698:     }
1699:     m = A->cmap->n;
1700:     n = B->cmap->n;
1701:     break;
1702:   case MATPRODUCT_ABt:
1703:   case MATPRODUCT_RARt:
1704:     mat = cusp->mat;
1705:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1706:     m   = A->rmap->n;
1707:     n   = B->rmap->n;
1708:     break;
1709:   default:
1710:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1711:   }
1712:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
1713:   csrmat = (CsrMatrix *)mat->mat;
1714:   /* if the user passed a CPU matrix, copy the data to the GPU */
1715:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
1716:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1717:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

1719:   PetscCall(MatDenseGetLDA(B, &blda));
1720:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1721:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1722:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1723:   } else {
1724:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1725:     PetscCall(MatDenseGetLDA(C, &clda));
1726:   }

1728:   PetscCall(PetscLogGpuTimeBegin());
1729:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1730: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
1731:   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
1732: #else
1733:   cusparseSpMatDescr_t &matADescr = mat->matDescr;
1734: #endif

1736:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
1737:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1738:     size_t mmBufferSize;
1739:     if (mmdata->initialized && mmdata->Blda != blda) {
1740:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1741:       mmdata->matBDescr = NULL;
1742:     }
1743:     if (!mmdata->matBDescr) {
1744:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1745:       mmdata->Blda = blda;
1746:     }

1748:     if (mmdata->initialized && mmdata->Clda != clda) {
1749:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1750:       mmdata->matCDescr = NULL;
1751:     }
1752:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1753:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1754:       mmdata->Clda = clda;
1755:     }

1757: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
1758:     if (matADescr) {
1759:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
1760:       matADescr = NULL;
1761:     }
1762: #endif

1764:     if (!matADescr) {
1765:       PetscCallCUSPARSE(cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1766:     }

1768:     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

1770:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1771:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1772:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
1773:       mmdata->mmBufferSize = mmBufferSize;
1774:     }

1776: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
1777:     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1778: #endif

1780:     mmdata->initialized = PETSC_TRUE;
1781:   } else {
1782:     /* to be safe, always update pointers of the mats */
1783:     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
1784:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1785:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1786:   }

1788:   /* do cusparseSpMM, which supports transpose on B */
1789:   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

1791:   PetscCall(PetscLogGpuTimeEnd());
1792:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1793:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1794:   if (product->type == MATPRODUCT_RARt) {
1795:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1796:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1797:   } else if (product->type == MATPRODUCT_PtAP) {
1798:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1799:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1800:   } else {
1801:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1802:   }
1803:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1804:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1805:   PetscFunctionReturn(PETSC_SUCCESS);
1806: }

1808: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1809: {
1810:   Mat_Product                  *product = C->product;
1811:   Mat                           A, B;
1812:   PetscInt                      m, n;
1813:   PetscBool                     cisdense, flg;
1814:   MatProductCtx_MatMatCusparse *mmdata;
1815:   Mat_SeqAIJCUSPARSE           *cusp;

1817:   PetscFunctionBegin;
1818:   MatCheckProduct(C, 1);
1819:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1820:   A = product->A;
1821:   B = product->B;
1822:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1823:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1824:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1825:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1826:   switch (product->type) {
1827:   case MATPRODUCT_AB:
1828:     m = A->rmap->n;
1829:     n = B->cmap->n;
1830:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
1831:     break;
1832:   case MATPRODUCT_AtB:
1833:     m = A->cmap->n;
1834:     n = B->cmap->n;
1835:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
1836:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1837:     break;
1838:   case MATPRODUCT_ABt:
1839:     m = A->rmap->n;
1840:     n = B->rmap->n;
1841:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
1842:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1843:     break;
1844:   case MATPRODUCT_PtAP:
1845:     m = B->cmap->n;
1846:     n = B->cmap->n;
1847:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
1848:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1849:     break;
1850:   case MATPRODUCT_RARt:
1851:     m = B->rmap->n;
1852:     n = B->rmap->n;
1853:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
1854:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1855:     break;
1856:   default:
1857:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1858:   }
1859:   PetscCall(MatSetSizes(C, m, n, m, n));
1860:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1861:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1862:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

1864:   /* product data */
1865:   PetscCall(PetscNew(&mmdata));
1866:   mmdata->cisdense = cisdense;
1867:   /* for these products we need intermediate storage */
1868:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1869:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1870:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
1871:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1872:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1873:     } else {
1874:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1875:     }
1876:   }
1877:   C->product->data    = mmdata;
1878:   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

1880:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1881:   PetscFunctionReturn(PETSC_SUCCESS);
1882: }

1884: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1885: {
1886:   Mat_Product                  *product = C->product;
1887:   Mat                           A, B;
1888:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
1889:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
1890:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1891:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
1892:   PetscBool                     flg;
1893:   MatProductType                ptype;
1894:   MatProductCtx_MatMatCusparse *mmdata;
1895:   cusparseSpMatDescr_t          BmatSpDescr;
1896:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

1898:   PetscFunctionBegin;
1899:   MatCheckProduct(C, 1);
1900:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1901:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
1902:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1903:   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
1904:   A      = product->A;
1905:   B      = product->B;
1906:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1907:     mmdata->reusesym = PETSC_FALSE;
1908:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
1909:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1910:     Cmat = Ccusp->mat;
1911:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1912:     Ccsr = (CsrMatrix *)Cmat->mat;
1913:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1914:     goto finalize;
1915:   }
1916:   if (!c->nz) goto finalize;
1917:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1918:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1919:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
1920:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1921:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1922:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1923:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1924:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
1925:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1926:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1927:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1928:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1929:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1930:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

1932:   ptype = product->type;
1933:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1934:     ptype = MATPRODUCT_AB;
1935:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1936:   }
1937:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1938:     ptype = MATPRODUCT_AB;
1939:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1940:   }
1941:   switch (ptype) {
1942:   case MATPRODUCT_AB:
1943:     Amat = Acusp->mat;
1944:     Bmat = Bcusp->mat;
1945:     break;
1946:   case MATPRODUCT_AtB:
1947:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1948:     Amat = Acusp->matTranspose;
1949:     Bmat = Bcusp->mat;
1950:     break;
1951:   case MATPRODUCT_ABt:
1952:     Amat = Acusp->mat;
1953:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
1954:     Bmat = Bcusp->matTranspose;
1955:     break;
1956:   default:
1957:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1958:   }
1959:   Cmat = Ccusp->mat;
1960:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1961:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1962:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1963:   Acsr = (CsrMatrix *)Amat->mat;
1964:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1965:   Ccsr = (CsrMatrix *)Cmat->mat;
1966:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
1967:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
1968:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1969:   PetscCall(PetscLogGpuTimeBegin());
1970:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
1971:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
1972:   PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
1973:   PetscCallCUSPARSE(cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
1974:   PetscCall(PetscLogGpuFlops(mmdata->flops));
1975:   PetscCallCUDA(WaitForCUDA());
1976:   PetscCall(PetscLogGpuTimeEnd());
1977:   C->offloadmask = PETSC_OFFLOAD_GPU;
1978: finalize:
1979:   /* shorter version of MatAssemblyEnd_SeqAIJ */
1980:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
1981:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
1982:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
1983:   c->reallocs = 0;
1984:   C->info.mallocs += 0;
1985:   C->info.nz_unneeded = 0;
1986:   C->assembled = C->was_assembled = PETSC_TRUE;
1987:   C->num_ass++;
1988:   PetscFunctionReturn(PETSC_SUCCESS);
1989: }

1991: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1992: {
1993:   Mat_Product                  *product = C->product;
1994:   Mat                           A, B;
1995:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
1996:   Mat_SeqAIJ                   *a, *b, *c;
1997:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1998:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
1999:   PetscInt                      i, j, m, n, k;
2000:   PetscBool                     flg;
2001:   MatProductType                ptype;
2002:   MatProductCtx_MatMatCusparse *mmdata;
2003:   PetscLogDouble                flops;
2004:   PetscBool                     biscompressed, ciscompressed;
2005:   int64_t                       C_num_rows1, C_num_cols1, C_nnz1;
2006:   cusparseSpMatDescr_t          BmatSpDescr;
2007:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2008:   size_t                        bufSize2;

2010:   PetscFunctionBegin;
2011:   MatCheckProduct(C, 1);
2012:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2013:   A = product->A;
2014:   B = product->B;
2015:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2016:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2017:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2018:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2019:   a = (Mat_SeqAIJ *)A->data;
2020:   b = (Mat_SeqAIJ *)B->data;
2021:   /* product data */
2022:   PetscCall(PetscNew(&mmdata));
2023:   C->product->data    = mmdata;
2024:   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

2026:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2027:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2028:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2029:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2030:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2031:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

2033:   ptype = product->type;
2034:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2035:     ptype                                          = MATPRODUCT_AB;
2036:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2037:   }
2038:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2039:     ptype                                          = MATPRODUCT_AB;
2040:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2041:   }
2042:   biscompressed = PETSC_FALSE;
2043:   ciscompressed = PETSC_FALSE;
2044:   switch (ptype) {
2045:   case MATPRODUCT_AB:
2046:     m    = A->rmap->n;
2047:     n    = B->cmap->n;
2048:     k    = A->cmap->n;
2049:     Amat = Acusp->mat;
2050:     Bmat = Bcusp->mat;
2051:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2052:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2053:     break;
2054:   case MATPRODUCT_AtB:
2055:     m = A->cmap->n;
2056:     n = B->cmap->n;
2057:     k = A->rmap->n;
2058:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2059:     Amat = Acusp->matTranspose;
2060:     Bmat = Bcusp->mat;
2061:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2062:     break;
2063:   case MATPRODUCT_ABt:
2064:     m = A->rmap->n;
2065:     n = B->rmap->n;
2066:     k = A->cmap->n;
2067:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2068:     Amat = Acusp->mat;
2069:     Bmat = Bcusp->matTranspose;
2070:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2071:     break;
2072:   default:
2073:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2074:   }

2076:   /* create cusparse matrix */
2077:   PetscCall(MatSetSizes(C, m, n, m, n));
2078:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2079:   c     = (Mat_SeqAIJ *)C->data;
2080:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2081:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2082:   Ccsr  = new CsrMatrix;

2084:   c->compressedrow.use = ciscompressed;
2085:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2086:     c->compressedrow.nrows = a->compressedrow.nrows;
2087:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2088:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2089:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2090:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2091:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2092:   } else {
2093:     c->compressedrow.nrows  = 0;
2094:     c->compressedrow.i      = NULL;
2095:     c->compressedrow.rindex = NULL;
2096:     Ccusp->workVector       = NULL;
2097:     Cmat->cprowIndices      = NULL;
2098:   }
2099:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2100:   Ccusp->mat        = Cmat;
2101:   Ccusp->mat->mat   = Ccsr;
2102:   Ccsr->num_rows    = Ccusp->nrows;
2103:   Ccsr->num_cols    = n;
2104:   Ccsr->row_offsets = new THRUSTINTARRAY(Ccusp->nrows + 1);
2105:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2106:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2107:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2108:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2109:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2110:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2111:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2112:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2113:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2114:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2115:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2116:     c->nz                = 0;
2117:     Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2118:     Ccsr->values         = new THRUSTARRAY(c->nz);
2119:     goto finalizesym;
2120:   }

2122:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2123:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2124:   Acsr = (CsrMatrix *)Amat->mat;
2125:   if (!biscompressed) {
2126:     Bcsr        = (CsrMatrix *)Bmat->mat;
2127:     BmatSpDescr = Bmat->matDescr;
2128:   } else { /* we need to use row offsets for the full matrix */
2129:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2130:     Bcsr                 = new CsrMatrix;
2131:     Bcsr->num_rows       = B->rmap->n;
2132:     Bcsr->num_cols       = cBcsr->num_cols;
2133:     Bcsr->num_entries    = cBcsr->num_entries;
2134:     Bcsr->column_indices = cBcsr->column_indices;
2135:     Bcsr->values         = cBcsr->values;
2136:     if (!Bcusp->rowoffsets_gpu) {
2137:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
2138:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2139:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2140:     }
2141:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2142:     mmdata->Bcsr      = Bcsr;
2143:     if (Bcsr->num_rows && Bcsr->num_cols) {
2144:       PetscCallCUSPARSE(cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2145:     }
2146:     BmatSpDescr = mmdata->matSpBDescr;
2147:   }
2148:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2149:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2150:   /* precompute flops count */
2151:   if (ptype == MATPRODUCT_AB) {
2152:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2153:       const PetscInt st = a->i[i];
2154:       const PetscInt en = a->i[i + 1];
2155:       for (j = st; j < en; j++) {
2156:         const PetscInt brow = a->j[j];
2157:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2158:       }
2159:     }
2160:   } else if (ptype == MATPRODUCT_AtB) {
2161:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2162:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2163:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2164:       flops += (2. * anzi) * bnzi;
2165:     }
2166:   } else { /* TODO */
2167:     flops = 0.;
2168:   }

2170:   mmdata->flops = flops;
2171:   PetscCall(PetscLogGpuTimeBegin());

2173:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2174:   // cuda-12.2 requires non-null csrRowOffsets
2175:   PetscCallCUSPARSE(cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2176:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2177:   // Note that cusparseSpGEMMreuse is deprecated in CUDA 13.2.1

2179:   PetscCheck(!PetscDefined(USE_64BIT_INDICES) || PETSC_PKG_CUDA_VERSION_GE(13, 0, 0), PETSC_COMM_SELF, PETSC_ERR_SUP_SYS, "cusparseSpGEMM did not support 64-bit indices before CUDA 13.0. Update your CUDA installation.");
2180:   /* ask bufferSize bytes for external memory */
2181:   PetscCallCUSPARSE(cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2182:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2183:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2184:   PetscCallCUSPARSE(cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2185:   /* ask bufferSize again bytes for external memory */
2186:   PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2187:   /* The CUSPARSE documentation is not clear, nor the API
2188:      We need both buffers to perform the operations properly!
2189:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2190:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2191:      is stored in the descriptor! What a messy API... */
2192:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2193:   /* compute the intermediate product of A * B */
2194:   PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2195:   /* get matrix C non-zero entries C_nnz1 */
2196:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2197:   PetscCall(PetscIntCast(C_nnz1, &c->nz));
2198:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2199:                       mmdata->mmBufferSize / 1024));
2200:   Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2201:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2202:   Ccsr->values = new THRUSTARRAY(c->nz);
2203:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2204:   if (c->nz) PetscCallCUSPARSE(cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2205:   PetscCallCUSPARSE(cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2206:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2207:   PetscCall(PetscLogGpuTimeEnd());
2208: finalizesym:
2209:   c->free_a = PETSC_TRUE;
2210:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2211:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2212:   c->free_ij = PETSC_TRUE;

2214:   PetscInt *d_i = c->i;
2215:   if (ciscompressed) d_i = c->compressedrow.i;
2216:   PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2217:   PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2218:   if (ciscompressed) { /* need to expand host row offsets */
2219:     PetscInt r = 0;
2220:     c->i[0]    = 0;
2221:     for (k = 0; k < c->compressedrow.nrows; k++) {
2222:       const PetscInt next = c->compressedrow.rindex[k];
2223:       const PetscInt old  = c->compressedrow.i[k];
2224:       for (; r < next; r++) c->i[r + 1] = old;
2225:     }
2226:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2227:   }
2228:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2229:   PetscCall(PetscMalloc1(m, &c->ilen));
2230:   PetscCall(PetscMalloc1(m, &c->imax));
2231:   c->maxnz         = c->nz;
2232:   c->nonzerorowcnt = 0;
2233:   c->rmax          = 0;
2234:   for (k = 0; k < m; k++) {
2235:     const PetscInt nn = c->i[k + 1] - c->i[k];
2236:     c->ilen[k] = c->imax[k] = nn;
2237:     c->nonzerorowcnt += (PetscInt)!!nn;
2238:     c->rmax = PetscMax(c->rmax, nn);
2239:   }
2240:   PetscCall(PetscMalloc1(c->nz, &c->a));
2241:   Ccsr->num_entries = c->nz;

2243:   C->nonzerostate++;
2244:   PetscCall(PetscLayoutSetUp(C->rmap));
2245:   PetscCall(PetscLayoutSetUp(C->cmap));
2246:   Ccusp->nonzerostate = C->nonzerostate;
2247:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2248:   C->preallocated     = PETSC_TRUE;
2249:   C->assembled        = PETSC_FALSE;
2250:   C->was_assembled    = PETSC_FALSE;
2251:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2252:     mmdata->reusesym = PETSC_TRUE;
2253:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2254:   }
2255:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2256:   PetscFunctionReturn(PETSC_SUCCESS);
2257: }

2259: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2261: /* handles sparse or dense B */
2262: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2263: {
2264:   Mat_Product *product = mat->product;
2265:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2267:   PetscFunctionBegin;
2268:   MatCheckProduct(mat, 1);
2269:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2270:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
2271:   if (product->type == MATPRODUCT_ABC) {
2272:     Ciscusp = PETSC_FALSE;
2273:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
2274:   }
2275:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2276:     PetscBool usecpu = PETSC_FALSE;
2277:     switch (product->type) {
2278:     case MATPRODUCT_AB:
2279:       if (product->api_user) {
2280:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2281:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2282:         PetscOptionsEnd();
2283:       } else {
2284:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2285:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2286:         PetscOptionsEnd();
2287:       }
2288:       break;
2289:     case MATPRODUCT_AtB:
2290:       if (product->api_user) {
2291:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2292:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2293:         PetscOptionsEnd();
2294:       } else {
2295:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2296:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2297:         PetscOptionsEnd();
2298:       }
2299:       break;
2300:     case MATPRODUCT_PtAP:
2301:       if (product->api_user) {
2302:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2303:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2304:         PetscOptionsEnd();
2305:       } else {
2306:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2307:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2308:         PetscOptionsEnd();
2309:       }
2310:       break;
2311:     case MATPRODUCT_RARt:
2312:       if (product->api_user) {
2313:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2314:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2315:         PetscOptionsEnd();
2316:       } else {
2317:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2318:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2319:         PetscOptionsEnd();
2320:       }
2321:       break;
2322:     case MATPRODUCT_ABC:
2323:       if (product->api_user) {
2324:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2325:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2326:         PetscOptionsEnd();
2327:       } else {
2328:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2329:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2330:         PetscOptionsEnd();
2331:       }
2332:       break;
2333:     default:
2334:       break;
2335:     }
2336:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2337:   }
2338:   /* dispatch */
2339:   if (isdense) {
2340:     switch (product->type) {
2341:     case MATPRODUCT_AB:
2342:     case MATPRODUCT_AtB:
2343:     case MATPRODUCT_ABt:
2344:     case MATPRODUCT_PtAP:
2345:     case MATPRODUCT_RARt:
2346:       if (product->A->boundtocpu) {
2347:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2348:       } else {
2349:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2350:       }
2351:       break;
2352:     case MATPRODUCT_ABC:
2353:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2354:       break;
2355:     default:
2356:       break;
2357:     }
2358:   } else if (Biscusp && Ciscusp) {
2359:     switch (product->type) {
2360:     case MATPRODUCT_AB:
2361:     case MATPRODUCT_AtB:
2362:     case MATPRODUCT_ABt:
2363:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2364:       break;
2365:     case MATPRODUCT_PtAP:
2366:     case MATPRODUCT_RARt:
2367:     case MATPRODUCT_ABC:
2368:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2369:       break;
2370:     default:
2371:       break;
2372:     }
2373:   } else { /* fallback for AIJ */
2374:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2375:   }
2376:   PetscFunctionReturn(PETSC_SUCCESS);
2377: }

2379: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2380: {
2381:   PetscFunctionBegin;
2382:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2383:   PetscFunctionReturn(PETSC_SUCCESS);
2384: }

2386: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2387: {
2388:   PetscFunctionBegin;
2389:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2390:   PetscFunctionReturn(PETSC_SUCCESS);
2391: }

2393: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2394: {
2395:   PetscFunctionBegin;
2396:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2397:   PetscFunctionReturn(PETSC_SUCCESS);
2398: }

2400: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2401: {
2402:   PetscFunctionBegin;
2403:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2404:   PetscFunctionReturn(PETSC_SUCCESS);
2405: }

2407: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2408: {
2409:   PetscFunctionBegin;
2410:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2411:   PetscFunctionReturn(PETSC_SUCCESS);
2412: }

2414: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2415: {
2416:   int i = blockIdx.x * blockDim.x + threadIdx.x;
2417:   if (i < n) y[idx[i]] += x[i];
2418: }

2420: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2421: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2422: {
2423:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2424:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2425:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2426:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
2427:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2428:   PetscBool                     compressed;
2429:   PetscInt                      nx, ny;

2431:   PetscFunctionBegin;
2432:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2433:   if (!a->nz) {
2434:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
2435:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
2436:     PetscFunctionReturn(PETSC_SUCCESS);
2437:   }
2438:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2439:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2440:   if (!trans) {
2441:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2442:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2443:   } else {
2444:     if (herm || !A->form_explicit_transpose) {
2445:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2446:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2447:     } else {
2448:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2449:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
2450:     }
2451:   }
2452:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2453:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2455:   try {
2456:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
2457:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2458:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

2460:     PetscCall(PetscLogGpuTimeBegin());
2461:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2462:       /* z = A x + beta y.
2463:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2464:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2465:       */
2466:       xptr = xarray;
2467:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2468:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2469:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2470:           allocated to accommodate different uses. So we get the length info directly from mat.
2471:        */
2472:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2473:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2474:         nx             = mat->num_cols; // since y = Ax
2475:         ny             = mat->num_rows;
2476:       }
2477:     } else {
2478:       /* z = A^T x + beta y
2479:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2480:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2481:        */
2482:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2483:       dptr = zarray;
2484:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2485:       if (compressed) { /* Scatter x to work vector */
2486:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

2488:         thrust::for_each(
2489: #if PetscDefined(HAVE_THRUST_ASYNC)
2490:           thrust::cuda::par.on(PetscDefaultCudaStream),
2491: #endif
2492:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2493:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
2494:       }
2495:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2496:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2497:         nx             = mat->num_rows; // since y = A^T x
2498:         ny             = mat->num_cols;
2499:       }
2500:     }

2502:     /* csr_spmv does y = alpha op(A) x + beta y */
2503:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2504: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2505:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
2506: #else
2507:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
2508: #endif

2510:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2511: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2512:       if (!matDescr) {
2513:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2514:         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2515:       }
2516: #endif

2518:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2519:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
2520:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
2521:         PetscCallCUSPARSE(
2522:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
2523:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
2524: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
2525:         PetscCallCUSPARSE(
2526:           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2527: #endif
2528:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2529:       } else {
2530:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2531:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
2532:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
2533:       }

2535:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));

2537:     } else {
2538:       if (cusparsestruct->nrows) {
2539:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2540:       }
2541:     }
2542:     PetscCall(PetscLogGpuTimeEnd());

2544:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2545:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
2546:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2547:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
2548:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2549:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2550:         }
2551:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2552:         PetscCall(VecSeq_CUDA::Set(zz, 0));
2553:       }

2555:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2556:       if (compressed) {
2557:         PetscCall(PetscLogGpuTimeBegin());
2558:         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
2559:         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
2560:         PetscCall(PetscLogGpuTimeEnd());
2561:       }
2562:     } else {
2563:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2564:     }
2565:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
2566:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
2567:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
2568:   } catch (char *ex) {
2569:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2570:   }
2571:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2572:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2573:   PetscFunctionReturn(PETSC_SUCCESS);
2574: }

2576: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2577: {
2578:   PetscFunctionBegin;
2579:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2580:   PetscFunctionReturn(PETSC_SUCCESS);
2581: }

2583: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
2584: {
2585:   PetscFunctionBegin;
2586:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetDiagonal(A, diag));
2587:   PetscFunctionReturn(PETSC_SUCCESS);
2588: }

2590: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec ll, Vec rr)
2591: {
2592:   PetscFunctionBegin;
2593:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2594:   PetscFunctionReturn(PETSC_SUCCESS);
2595: }

2597: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
2598: {
2599:   PetscFunctionBegin;
2600:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode));
2601:   PetscFunctionReturn(PETSC_SUCCESS);
2602: }

2604: /*@
2605:   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

2607:   Collective

2609:   Input Parameters:
2610: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2611: . m    - number of rows
2612: . n    - number of columns
2613: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
2614: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

2616:   Output Parameter:
2617: . A - the matrix

2619:   Level: intermediate

2621:   Notes:
2622:   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
2623:   calculations. For good matrix assembly performance the user should preallocate the matrix
2624:   storage by setting the parameter `nz` (or the array `nnz`).

2626:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2627:   MatXXXXSetPreallocation() paradgm instead of this routine directly.
2628:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

2630:   The AIJ format, also called
2631:   compressed row storage, is fully compatible with standard Fortran
2632:   storage.  That is, the stored row and column indices can begin at
2633:   either one (as in Fortran) or zero.

2635:   Specify the preallocated storage with either nz or nnz (not both).
2636:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2637:   allocation.

2639:   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

2641: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
2642:           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
2643: @*/
2644: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2645: {
2646:   return MatSeqAIJCUSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2647: }

2649: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2650: {
2651:   return MatSeqAIJCUSPARSE_CUPM_t::Destroy(A);
2652: }

2654: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2655: {
2656:   PetscFunctionBegin;
2657:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2658:   PetscFunctionReturn(PETSC_SUCCESS);
2659: }

2661: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2662: {
2663:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2664:   Mat_SeqAIJCUSPARSE *cy;
2665:   Mat_SeqAIJCUSPARSE *cx;
2666:   CsrMatrix          *csry, *csrx;

2668:   PetscFunctionBegin;
2669:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
2670:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
2671:   if (X->ops->axpy != Y->ops->axpy) {
2672:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2673:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2674:     PetscFunctionReturn(PETSC_SUCCESS);
2675:   }
2676:   /* if we are here, it means both matrices are bound to GPU */
2677:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
2678:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
2679:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2680:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2681:   csry = (CsrMatrix *)cy->mat->mat;
2682:   csrx = (CsrMatrix *)cx->mat->mat;
2683:   /* see if we can turn this into a cublas axpy */
2684:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2685:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2686:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2687:     if (eq) str = SAME_NONZERO_PATTERN;
2688:   }
2689:   /* spgeam is buggy with one column */
2690:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

2692: #if !defined(PETSC_USE_64BIT_INDICES) // cusparseScsrgeam2 etc. do not support 64bit indices
2693:   if (str == SUBSET_NONZERO_PATTERN) {
2694:     PetscScalar       *ay, b = 1.0;
2695:     const PetscScalar *ax;
2696:     size_t             bufferSize;
2697:     void              *buffer;

2699:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
2700:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
2701:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
2702:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2703:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2704:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
2705:     PetscCall(PetscLogGpuTimeBegin());
2706:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2707:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2708:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2709:     PetscCall(PetscLogGpuTimeEnd());
2710:     PetscCallCUDA(cudaFree(buffer));

2712:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
2713:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
2714:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
2715:   } else
2716: #endif
2717:     if (str == SAME_NONZERO_PATTERN) {
2718:     PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2719:   } else {
2720:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2721:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2722:   }
2723:   PetscFunctionReturn(PETSC_SUCCESS);
2724: }

2726: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
2727: {
2728:   PetscFunctionBegin;
2729:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Scale(Y, a));
2730:   PetscFunctionReturn(PETSC_SUCCESS);
2731: }

2733: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2734: {
2735:   PetscFunctionBegin;
2736:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::ZeroEntries(A));
2737:   PetscFunctionReturn(PETSC_SUCCESS);
2738: }

2740: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(Mat A, PetscMemType *m)
2741: {
2742:   PetscFunctionBegin;
2743:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetCurrentMemType(A, m));
2744:   PetscFunctionReturn(PETSC_SUCCESS);
2745: }

2747: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
2748: {
2749:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

2751:   PetscFunctionBegin;
2752:   if (A->factortype != MAT_FACTOR_NONE) {
2753:     A->boundtocpu = flg;
2754:     PetscFunctionReturn(PETSC_SUCCESS);
2755:   }
2756:   if (flg) {
2757:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

2759:     A->ops->scale                     = MatScale_SeqAIJ;
2760:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
2761:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
2762:     A->ops->axpy                      = MatAXPY_SeqAIJ;
2763:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
2764:     A->ops->mult                      = MatMult_SeqAIJ;
2765:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2766:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2767:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2768:     A->ops->multhermitiantranspose    = NULL;
2769:     A->ops->multhermitiantransposeadd = NULL;
2770:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
2771:     A->ops->getcurrentmemtype         = NULL;
2772:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2773:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2774:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
2775:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
2776:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2777:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2778:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
2779:   } else {
2780:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
2781:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
2782:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJCUSPARSE;
2783:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
2784:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
2785:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2786:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2787:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2788:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2789:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2790:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2791:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
2792:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
2793:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
2794:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
2795:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
2796:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
2797:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
2798:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
2799:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

2801:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
2802:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2803:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2804:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
2805:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
2806:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2807:   }
2808:   A->boundtocpu = flg;
2809:   a->inode.use  = (flg && a->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
2810:   PetscFunctionReturn(PETSC_SUCCESS);
2811: }

2813: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
2814: {
2815:   Mat B;

2817:   PetscFunctionBegin;
2818:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
2819:   if (reuse == MAT_INITIAL_MATRIX) {
2820:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2821:   } else if (reuse == MAT_REUSE_MATRIX) {
2822:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2823:   }
2824:   B = *newmat;

2826:   PetscCall(PetscFree(B->defaultvectype));
2827:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

2829:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2830:     if (B->factortype == MAT_FACTOR_NONE) {
2831:       Mat_SeqAIJCUSPARSE *spptr;
2832:       PetscCall(PetscNew(&spptr));
2833:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2834:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2835:       spptr->format     = MAT_CUSPARSE_CSR;
2836:       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
2837:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2838:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
2839:       B->spptr          = spptr;
2840:     } else {
2841:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2843:       PetscCall(PetscNew(&spptr));
2844:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2845:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2846:       B->spptr = spptr;
2847:     }
2848:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2849:   }
2850:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
2851:   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
2852:   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
2853:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
2854:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
2855:   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
2856:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

2858:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
2859:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
2860:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
2861: #if defined(PETSC_HAVE_HYPRE)
2862:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
2863: #endif
2864:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
2865:   PetscFunctionReturn(PETSC_SUCCESS);
2866: }

2868: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2869: {
2870:   PetscFunctionBegin;
2871:   PetscCall(MatCreate_SeqAIJ(B));
2872:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
2873:   PetscFunctionReturn(PETSC_SUCCESS);
2874: }

2876: /*MC
2877:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

2879:    Options Database Keys:
2880: +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
2881: .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2882:                                            Other options include ell (ellpack) or hyb (hybrid).
2883: .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2884: -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU

2886:   Level: beginner

2888:   Notes:
2889:   These matrices can be in either CSR, ELL, or HYB format.

2891:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

2893:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
2894:   if some integer values passed in do not fit in `int`.

2896: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2897: M*/

2899: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2900: {
2901:   PetscFunctionBegin;
2902:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
2903:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
2904:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
2905:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
2906:   PetscFunctionReturn(PETSC_SUCCESS);
2907: }

2909: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
2910: {
2911:   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

2913:   PetscFunctionBegin;
2914:   if (cusp) {
2915:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2916:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2917:     delete cusp->workVector;
2918:     delete cusp->rowoffsets_gpu;
2919:     delete cusp->csr2csc_i;
2920:     delete cusp->coords;
2921:     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
2922:     PetscCall(PetscFree(mat->spptr));
2923:   }
2924:   PetscFunctionReturn(PETSC_SUCCESS);
2925: }

2927: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2928: {
2929:   PetscFunctionBegin;
2930:   if (*mat) {
2931:     delete (*mat)->values;
2932:     delete (*mat)->column_indices;
2933:     delete (*mat)->row_offsets;
2934:     delete *mat;
2935:     *mat = 0;
2936:   }
2937:   PetscFunctionReturn(PETSC_SUCCESS);
2938: }

2940: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
2941: {
2942:   CsrMatrix *mat;

2944:   PetscFunctionBegin;
2945:   if (*matstruct) {
2946:     if ((*matstruct)->mat) {
2947:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
2948:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2949:       } else {
2950:         mat = (CsrMatrix *)(*matstruct)->mat;
2951:         PetscCall(CsrMatrix_Destroy(&mat));
2952:       }
2953:     }
2954:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
2955:     delete (*matstruct)->cprowIndices;
2956:     PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
2957:     PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
2958:     PetscCallCUDA(cudaFree((*matstruct)->beta_one));

2960:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2961:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

2963:     for (int i = 0; i < 3; i++) {
2964:       if (mdata->cuSpMV[i].initialized) {
2965:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
2966:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
2967:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
2968: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2969:         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
2970:         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
2971: #endif
2972:       }
2973:     }
2974:     delete *matstruct;
2975:     *matstruct = NULL;
2976:   }
2977:   PetscFunctionReturn(PETSC_SUCCESS);
2978: }

2980: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
2981: {
2982:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

2984:   PetscFunctionBegin;
2985:   if (fs) {
2986:     delete fs->rpermIndices;
2987:     delete fs->cpermIndices;
2988:     fs->rpermIndices  = NULL;
2989:     fs->cpermIndices  = NULL;
2990:     fs->init_dev_prop = PETSC_FALSE;
2991:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
2992:     PetscCallCUDA(cudaFree(fs->csrColIdx));
2993:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
2994:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
2995:     PetscCallCUDA(cudaFree(fs->csrVal));
2996:     PetscCallCUDA(cudaFree(fs->diag));
2997:     PetscCallCUDA(cudaFree(fs->X));
2998:     PetscCallCUDA(cudaFree(fs->Y));
2999:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3000:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3001:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3002:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3003:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3004:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3005:     if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3006:     if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3007:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3008:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3009:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3010:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3011:     if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3012:     if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3013:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3014:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3015:     PetscCall(PetscFree(fs->csrRowPtr_h));
3016:     PetscCall(PetscFree(fs->csrVal_h));
3017:     PetscCall(PetscFree(fs->diag_h));
3018:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3019:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3020:   }
3021:   PetscFunctionReturn(PETSC_SUCCESS);
3022: }

3024: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3025: {
3026:   PetscFunctionBegin;
3027:   if (*trifactors) {
3028:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3029:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3030:     PetscCall(PetscFree(*trifactors));
3031:   }
3032:   PetscFunctionReturn(PETSC_SUCCESS);
3033: }

3035: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3036: {
3037:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

3039:   PetscFunctionBegin;
3040:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3041:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3042:   if (destroy) {
3043:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3044:     delete cusp->csr2csc_i;
3045:     cusp->csr2csc_i = NULL;
3046:   }
3047:   A->transupdated = PETSC_FALSE;
3048:   PetscFunctionReturn(PETSC_SUCCESS);
3049: }

3051: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3052: {
3053:   PetscFunctionBegin;
3054:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3055:   PetscFunctionReturn(PETSC_SUCCESS);
3056: }

3058: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3059: {
3060:   PetscFunctionBegin;
3061:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3062:   PetscFunctionReturn(PETSC_SUCCESS);
3063: }

3065: /*@C
3066:   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

3068:   Not Collective

3070:   Input Parameters:
3071: + A          - the matrix
3072: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

3074:   Output Parameters:
3075: + i - the CSR row pointers
3076: - j - the CSR column indices

3078:   Level: developer

3080:   Note:
3081:   When compressed is true, the CSR structure does not contain empty rows

3083: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
3084: @*/
3085: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3086: {
3087:   PetscFunctionBegin;
3088:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3089:   PetscFunctionReturn(PETSC_SUCCESS);
3090: }

3092: /*@C
3093:   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

3095:   Not Collective

3097:   Input Parameters:
3098: + A          - the matrix
3099: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3100: . i          - the CSR row pointers
3101: - j          - the CSR column indices

3103:   Level: developer

3105: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
3106: @*/
3107: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3108: {
3109:   PetscFunctionBegin;
3110:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3111:   PetscFunctionReturn(PETSC_SUCCESS);
3112: }

3114: /*@C
3115:   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

3117:   Not Collective

3119:   Input Parameter:
3120: . A - a `MATSEQAIJCUSPARSE` matrix

3122:   Output Parameter:
3123: . a - pointer to the device data

3125:   Level: developer

3127:   Note:
3128:   Will trigger host-to-device copies if the most up-to-date matrix data is on the host

3130: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
3131: @*/
3132: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3133: {
3134:   return MatSeqAIJCUSPARSE_CUPM_t::GetArrayRead(A, a);
3135: }

3137: /*@C
3138:   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

3140:   Not Collective

3142:   Input Parameters:
3143: + A - a `MATSEQAIJCUSPARSE` matrix
3144: - a - pointer to the device data

3146:   Level: developer

3148: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
3149: @*/
3150: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3151: {
3152:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayRead(A, a);
3153: }

3155: /*@C
3156:   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

3158:   Not Collective

3160:   Input Parameter:
3161: . A - a `MATSEQAIJCUSPARSE` matrix

3163:   Output Parameter:
3164: . a - pointer to the device data

3166:   Level: developer

3168:   Note:
3169:   Will trigger host-to-device copies if the most up-to-date matrix data is on the host

3171: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
3172: @*/
3173: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
3174: {
3175:   return MatSeqAIJCUSPARSE_CUPM_t::GetArray(A, a);
3176: }
3177: /*@C
3178:   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

3180:   Not Collective

3182:   Input Parameters:
3183: + A - a `MATSEQAIJCUSPARSE` matrix
3184: - a - pointer to the device data

3186:   Level: developer

3188: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
3189: @*/
3190: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
3191: {
3192:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArray(A, a);
3193: }

3195: /*@C
3196:   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

3198:   Not Collective

3200:   Input Parameter:
3201: . A - a `MATSEQAIJCUSPARSE` matrix

3203:   Output Parameter:
3204: . a - pointer to the device data

3206:   Level: developer

3208:   Note:
3209:   Does not trigger any host to device copies.

3211:   It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

3213: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
3214: @*/
3215: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
3216: {
3217:   return MatSeqAIJCUSPARSE_CUPM_t::GetArrayWrite(A, a);
3218: }

3220: /*@C
3221:   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

3223:   Not Collective

3225:   Input Parameters:
3226: + A - a `MATSEQAIJCUSPARSE` matrix
3227: - a - pointer to the device data

3229:   Level: developer

3231: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
3232: @*/
3233: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
3234: {
3235:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3236: }

3238: struct IJCompare4 {
3239:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt, PetscScalar, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt, PetscScalar, PetscInt> &t2)
3240:   {
3241:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
3242:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
3243:     return false;
3244:   }
3245: };

3247: struct Shift {
3248:   PetscInt _shift;

3250:   Shift(PetscInt shift) : _shift(shift) { }
3251:   __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c + _shift; }
3252: };

3254: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3255: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3256: {
3257:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3258:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
3259:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3260:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3261:   PetscInt                      Annz, Bnnz;
3262:   PetscInt                      i, m, n, zero = 0;

3264:   PetscFunctionBegin;
3267:   PetscAssertPointer(C, 4);
3268:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3269:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
3270:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3271:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3272:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3273:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3274:   if (reuse == MAT_INITIAL_MATRIX) {
3275:     m = A->rmap->n;
3276:     n = A->cmap->n + B->cmap->n;
3277:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
3278:     PetscCall(MatSetSizes(*C, m, n, m, n));
3279:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
3280:     c                       = (Mat_SeqAIJ *)(*C)->data;
3281:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3282:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
3283:     Ccsr                    = new CsrMatrix;
3284:     Cmat->cprowIndices      = NULL;
3285:     c->compressedrow.use    = PETSC_FALSE;
3286:     c->compressedrow.nrows  = 0;
3287:     c->compressedrow.i      = NULL;
3288:     c->compressedrow.rindex = NULL;
3289:     Ccusp->workVector       = NULL;
3290:     Ccusp->nrows            = m;
3291:     Ccusp->mat              = Cmat;
3292:     Ccusp->mat->mat         = Ccsr;
3293:     Ccsr->num_rows          = m;
3294:     Ccsr->num_cols          = n;
3295:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3296:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3297:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3298:     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3299:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3300:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3301:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3302:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3303:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3304:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3305:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3306:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3307:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

3309:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
3310:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
3311:     Annz                 = (PetscInt)Acsr->column_indices->size();
3312:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
3313:     c->nz                = Annz + Bnnz;
3314:     Ccsr->row_offsets    = new THRUSTINTARRAY(m + 1);
3315:     Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
3316:     Ccsr->values         = new THRUSTARRAY(c->nz);
3317:     Ccsr->num_entries    = c->nz;
3318:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
3319:     if (c->nz) {
3320:       auto            Acoo = new THRUSTINTARRAY(Annz); // initialized with zeros
3321:       auto            Bcoo = new THRUSTINTARRAY(Bnnz);
3322:       auto            Ccoo = new THRUSTINTARRAY(c->nz);
3323:       THRUSTINTARRAY *Aroff, *Broff;

3325:       if (a->compressedrow.use) { /* need full row offset */
3326:         if (!Acusp->rowoffsets_gpu) {
3327:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
3328:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3329:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3330:         }
3331:         Aroff = Acusp->rowoffsets_gpu;
3332:       } else Aroff = Acsr->row_offsets;
3333:       if (b->compressedrow.use) { /* need full row offset */
3334:         if (!Bcusp->rowoffsets_gpu) {
3335:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
3336:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3337:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3338:         }
3339:         Broff = Bcusp->rowoffsets_gpu;
3340:       } else Broff = Bcsr->row_offsets;
3341:       PetscCall(PetscLogGpuTimeBegin());
3342:       // Implement cusparseXcsr2coo() with Thrust, as the former doesn't support 64-bit indices.
3343:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Aroff->data().get(), Acoo->data().get())));
3344:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Broff->data().get(), Bcoo->data().get())));

3346:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3347:       auto Aperm = thrust::make_constant_iterator(1);
3348:       auto Bperm = thrust::make_constant_iterator(0);
3349:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3350:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3351:       auto wPerm = new THRUSTINTARRAY(Annz + Bnnz);
3352:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3353:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3354:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); // Use B column indices shifted by A->cmap->n
3355:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3356:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3357:       auto p1    = Ccusp->coords->begin();
3358:       auto p2    = Ccusp->coords->begin();
3359: #if CCCL_VERSION >= 3001000
3360:       cuda::std::advance(p2, Annz);
3361: #else
3362:       thrust::advance(p2, Annz);
3363: #endif
3364:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); // put nonzeros in A and B to C in sorted order (by row and then by column)
3365:       auto cci = thrust::make_counting_iterator(zero);
3366:       auto cce = thrust::make_counting_iterator(c->nz);
3367: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
3368:       auto pred = thrust::identity<int>();
3369: #else
3370:       auto pred = cuda::std::identity();
3371: #endif
3372:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3373:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3374:       // Implement a simplified cusparseXcoo2csr() with Thrust (assuming the row indices are already sorted), as the former doesn't support 64-bit indices.
3375:       PetscCallThrust(thrust::lower_bound(thrust::device, Ccoo->begin(), Ccoo->end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m + 1), Ccsr->row_offsets->begin()));
3376:       PetscCall(PetscLogGpuTimeEnd());
3377:       delete wPerm;
3378:       delete Acoo;
3379:       delete Bcoo;
3380:       delete Ccoo;
3381:       PetscCallCUSPARSE(cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3382:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3383:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3384:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3385:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3386:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
3387:         CsrMatrix                    *CcsrT = new CsrMatrix;
3388:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3389:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

3391:         (*C)->form_explicit_transpose = PETSC_TRUE;
3392:         (*C)->transupdated            = PETSC_TRUE;
3393:         Ccusp->rowoffsets_gpu         = NULL;
3394:         CmatT->cprowIndices           = NULL;
3395:         CmatT->mat                    = CcsrT;
3396:         CcsrT->num_rows               = n;
3397:         CcsrT->num_cols               = m;
3398:         CcsrT->num_entries            = c->nz;

3400:         CcsrT->row_offsets    = new THRUSTINTARRAY(n + 1);
3401:         CcsrT->column_indices = new THRUSTINTARRAY(c->nz);
3402:         CcsrT->values         = new THRUSTARRAY(c->nz);

3404:         PetscCall(PetscLogGpuTimeBegin());
3405:         auto rT = CcsrT->row_offsets->begin();
3406:         if (AT) {
3407:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3408: #if CCCL_VERSION >= 3001000
3409:           cuda::std::advance(rT, -1);
3410: #else
3411:           thrust::advance(rT, -1);
3412: #endif
3413:         }
3414:         if (BT) {
3415:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3416:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3417:           thrust::copy(titb, tite, rT);
3418:         }
3419:         auto cT = CcsrT->column_indices->begin();
3420:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3421:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3422:         auto vT = CcsrT->values->begin();
3423:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3424:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3425:         PetscCall(PetscLogGpuTimeEnd());

3427:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
3428:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
3429:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3430:         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3431:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3432:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3433:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3434:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3435:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3436:         PetscCallCUSPARSE(cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3437:         Ccusp->matTranspose = CmatT;
3438:       }
3439:     }

3441:     c->free_a = PETSC_TRUE;
3442:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3443:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3444:     c->free_ij = PETSC_TRUE;
3445:     PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3446:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3447:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3448:     PetscCall(PetscMalloc1(m, &c->ilen));
3449:     PetscCall(PetscMalloc1(m, &c->imax));
3450:     c->maxnz         = c->nz;
3451:     c->nonzerorowcnt = 0;
3452:     c->rmax          = 0;
3453:     for (i = 0; i < m; i++) {
3454:       const PetscInt nn = c->i[i + 1] - c->i[i];
3455:       c->ilen[i] = c->imax[i] = nn;
3456:       c->nonzerorowcnt += (PetscInt)!!nn;
3457:       c->rmax = PetscMax(c->rmax, nn);
3458:     }
3459:     PetscCall(PetscMalloc1(c->nz, &c->a));
3460:     (*C)->nonzerostate++;
3461:     PetscCall(PetscLayoutSetUp((*C)->rmap));
3462:     PetscCall(PetscLayoutSetUp((*C)->cmap));
3463:     Ccusp->nonzerostate = (*C)->nonzerostate;
3464:     (*C)->preallocated  = PETSC_TRUE;
3465:   } else {
3466:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3467:     c = (Mat_SeqAIJ *)(*C)->data;
3468:     if (c->nz) {
3469:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3470:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3471:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3472:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3473:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3474:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3475:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3476:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3477:       Acsr = (CsrMatrix *)Acusp->mat->mat;
3478:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3479:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3480:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3481:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3482:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3483:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3484:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3485:       auto pmid = Ccusp->coords->begin();
3486: #if CCCL_VERSION >= 3001000
3487:       cuda::std::advance(pmid, Acsr->num_entries);
3488: #else
3489:       thrust::advance(pmid, Acsr->num_entries);
3490: #endif
3491:       PetscCall(PetscLogGpuTimeBegin());
3492:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3493:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3494:       thrust::for_each(zibait, zieait, VecCUDAEquals());
3495:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3496:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3497:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
3498:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3499:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3500:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
3501:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3502:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3503:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3504:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3505:         auto       vT    = CcsrT->values->begin();
3506:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3507:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3508:         (*C)->transupdated = PETSC_TRUE;
3509:       }
3510:       PetscCall(PetscLogGpuTimeEnd());
3511:     }
3512:   }
3513:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3514:   (*C)->assembled     = PETSC_TRUE;
3515:   (*C)->was_assembled = PETSC_FALSE;
3516:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
3517:   PetscFunctionReturn(PETSC_SUCCESS);
3518: }

3520: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3521: {
3522:   PetscFunctionBegin;
3523:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3524:   PetscFunctionReturn(PETSC_SUCCESS);
3525: }