Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18: #endif
 19: #include <thrust/iterator/constant_iterator.h>
 20: #include <thrust/remove.h>
 21: #include <thrust/sort.h>
 22: #include <thrust/unique.h>
 23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
 24:   #include <cuda/std/functional>
 25: #endif

 27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 29: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 30:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

 32:   typedef enum {
 33:       CUSPARSE_MV_ALG_DEFAULT = 0,
 34:       CUSPARSE_COOMV_ALG      = 1,
 35:       CUSPARSE_CSRMV_ALG1     = 2,
 36:       CUSPARSE_CSRMV_ALG2     = 3
 37:   } cusparseSpMVAlg_t;

 39:   typedef enum {
 40:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 41:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 42:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 43:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 44:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 45:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 46:       CUSPARSE_SPMM_COO_ALG1    = 1,
 47:       CUSPARSE_SPMM_COO_ALG2    = 2,
 48:       CUSPARSE_SPMM_COO_ALG3    = 3,
 49:       CUSPARSE_SPMM_COO_ALG4    = 5,
 50:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 51:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 52:   } cusparseSpMMAlg_t;

 54:   typedef enum {
 55:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 56:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 57:   } cusparseCsr2CscAlg_t;
 58:   */
 59: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 60: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 61: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 62: #endif
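/*
  A worked example of the ordering requirement above (a sketch; the command line is hypothetical):
  passing

    -mat_cusparse_spmv_alg csrmv_alg1

  makes PetscOptionsEnum() in MatSetFromOptions_SeqAIJCUSPARSE() below match index 2 of
  MatCUSPARSESpMVAlgorithms[], and that index is used verbatim as the cuSPARSE value
  CUSPARSE_CSRMV_ALG1 (= 2, renamed CUSPARSE_SPMV_CSR_ALG1 in newer cuSPARSE). The PetscCheck()
  calls in MatSetFromOptions_SeqAIJCUSPARSE() guard against the cuSPARSE enum values drifting
  away from this positional encoding.
*/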

 64: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 65: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 66: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 67: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 68: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
 69: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 72: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 73: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 74: #endif
 75: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 76: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 77: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 78: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 79: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 80: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 81: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 82: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 83: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 84: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 86: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 88: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 89: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 91: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 92: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 94: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 95: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 96: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 98: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 99: {
100:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

102:   PetscFunctionBegin;
103:   switch (op) {
104:   case MAT_CUSPARSE_MULT:
105:     cusparsestruct->format = format;
106:     break;
107:   case MAT_CUSPARSE_ALL:
108:     cusparsestruct->format = format;
109:     break;
110:   default:
111:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112:   }
113:   PetscFunctionReturn(PETSC_SUCCESS);
114: }

116: /*@
117:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118:   operation. Only the `MatMult()` operation can use different GPU storage formats

120:   Not Collective

122:   Input Parameters:
123: + A      - Matrix of type `MATSEQAIJCUSPARSE`
124: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
125:         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`, `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).

128:   Level: intermediate

130: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
131: @*/
132: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
133: {
134:   PetscFunctionBegin;
136:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
137:   PetscFunctionReturn(PETSC_SUCCESS);
138: }
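/*
  A minimal usage sketch (hypothetical code, not taken from a PETSc example; the size n and the
  assembly loop are assumed): create a MATSEQAIJCUSPARSE matrix and request the ELL (ELLPACK)
  storage format for MatMult().

    #include <petscmat.h>

    Mat      A;
    PetscInt n = 100;

    PetscCall(MatCreate(PETSC_COMM_SELF, &A));
    PetscCall(MatSetSizes(A, n, n, n, n));
    PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
    PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
    // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as usual; MatMult() then uses ELL on the GPU

  The same choice is available at runtime with -mat_cusparse_mult_storage_format ell, handled in
  MatSetFromOptions_SeqAIJCUSPARSE() below.
*/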

140: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
141: {
142:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

144:   PetscFunctionBegin;
145:   cusparsestruct->use_cpu_solve = use_cpu;
146:   PetscFunctionReturn(PETSC_SUCCESS);
147: }

149: /*@
150:   MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

152:   Input Parameters:
153: + A       - Matrix of type `MATSEQAIJCUSPARSE`
154: - use_cpu - set flag for using the built-in CPU `MatSolve()`

156:   Level: intermediate

158:   Note:
159:   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
160:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
161:   This method specifies whether the solve is done on the CPU or on the GPU (GPU is the default).

163: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
164: @*/
165: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
166: {
167:   PetscFunctionBegin;
169:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
170:   PetscFunctionReturn(PETSC_SUCCESS);
171: }
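/*
  A minimal usage sketch (hypothetical; the index sets rowperm/colperm and the vectors b, x are
  assumed to exist): keep the triangular solves of a cuSPARSE LU factorization on the CPU.

    Mat           F;
    MatFactorInfo info;

    PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE)); // A is a MATSEQAIJCUSPARSE matrix
    PetscCall(MatFactorInfoInitialize(&info));
    PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F));
    PetscCall(MatLUFactorSymbolic(F, A, rowperm, colperm, &info));
    PetscCall(MatLUFactorNumeric(F, A, &info));
    PetscCall(MatSolve(F, b, x)); // runs the built-in CPU MatSolve()

  The same behavior can be requested at runtime with -mat_cusparse_use_cpu_solve, handled in
  MatSetFromOptions_SeqAIJCUSPARSE() below.
*/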

173: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
174: {
175:   PetscFunctionBegin;
176:   switch (op) {
177:   case MAT_FORM_EXPLICIT_TRANSPOSE:
178:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
179:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
180:     A->form_explicit_transpose = flg;
181:     break;
182:   default:
183:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
184:     break;
185:   }
186:   PetscFunctionReturn(PETSC_SUCCESS);
187: }

189: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
190: {
191:   MatCUSPARSEStorageFormat format;
192:   PetscBool                flg;
193:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

195:   PetscFunctionBegin;
196:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
197:   if (A->factortype == MAT_FACTOR_NONE) {
198:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
199:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

201:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
202:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
203:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
204:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
205: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
206:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
207:     /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
208:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
209:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210:   #else
211:     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
212:   #endif
213:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
214:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

216:     PetscCall(
217:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
218:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
219: #endif
220:   }
221:   PetscOptionsHeadEnd();
222:   PetscFunctionReturn(PETSC_SUCCESS);
223: }
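/*
  Runtime configuration example (the executable name and problem setup are hypothetical; the
  option names are the ones registered above):

    ./app -mat_type seqaijcusparse \
          -mat_cusparse_storage_format ell \
          -mat_cusparse_use_cpu_solve \
          -mat_cusparse_spmv_alg csrmv_alg1

  These options take effect when MatSetFromOptions() is called on a non-factored
  MATSEQAIJCUSPARSE matrix (the A->factortype == MAT_FACTOR_NONE branch above).
*/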

225: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
226: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
227: {
228:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
229:   PetscInt                      m  = A->rmap->n;
230:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
231:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
232:   const MatScalar              *Aa = a->a;
233:   PetscInt                     *Mi, *Mj, Mnz;
234:   PetscScalar                  *Ma;

236:   PetscFunctionBegin;
237:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
238:     if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
239:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
240:       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
241:       PetscCall(PetscMalloc1(m + 1, &Mi));
242:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
243:       PetscCall(PetscMalloc1(Mnz, &Ma));
244:       Mi[0] = 0;
245:       for (PetscInt i = 0; i < m; i++) {
246:         PetscInt llen = Ai[i + 1] - Ai[i];
247:         PetscInt ulen = Adiag[i] - Adiag[i + 1];
248:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
249:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
250:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
251:         Mi[i + 1] = Mi[i] + llen + ulen;
252:       }
253:       // Copy M (L,U) from host to device
254:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
255:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
256:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
257:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
258:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

260:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
261:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
262:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
263:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
264:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
265:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
266:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
267:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

269:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
270:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
271:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

273:       fillMode = CUSPARSE_FILL_MODE_UPPER;
274:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
275:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
276:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
277:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

279:       // Allocate work vectors in SpSv
280:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
281:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

283:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
284:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

286:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
287:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
288:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
289:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
290:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
291:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
292:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

294:       // Record for reuse
295:       fs->csrRowPtr_h = Mi;
296:       fs->csrVal_h    = Ma;
297:       PetscCall(PetscFree(Mj));
298:     }
299:     // Copy the value
300:     Mi  = fs->csrRowPtr_h;
301:     Ma  = fs->csrVal_h;
302:     Mnz = Mi[m];
303:     for (PetscInt i = 0; i < m; i++) {
304:       PetscInt llen = Ai[i + 1] - Ai[i];
305:       PetscInt ulen = Adiag[i] - Adiag[i + 1];
306:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
307:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
308:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
309:     }
310:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

312:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
313:     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
314:       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
315:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
316:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
317:     } else
318:   #endif
319:     {
320:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
321:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

323:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
324:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
325:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
326:     }
327:   }
328:   PetscFunctionReturn(PETSC_SUCCESS);
329: }
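/*
  Layout produced by MatSeqAIJCUSPARSEBuildFactoredMatrix_LU() (a summary of the loops above,
  not additional API): for each row i the merged factor M stores, contiguously,

    [ strictly lower part of L:  llen = Ai[i+1] - Ai[i] entries               ]
    [ diagonal of U at column i: the value 1.0 / Aa[Adiag[i]]                 ]
    [ strictly upper part of U:  ulen - 1 = Adiag[i] - Adiag[i+1] - 1 entries ]

  so Mi[i+1] - Mi[i] = llen + ulen. A single CSR matrix (csrRowPtr/csrColIdx/csrVal) backs both
  spMatDescr_L and spMatDescr_U; the CUSPARSE_SPMAT_FILL_MODE and CUSPARSE_SPMAT_DIAG_TYPE
  attributes tell the SpSV solves to read only the lower (unit-diagonal) or upper (non-unit
  diagonal) triangle of those shared arrays.
*/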
330: #else
331: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
332: {
333:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
334:   PetscInt                           n                  = A->rmap->n;
335:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
336:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
337:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
338:   const MatScalar                   *aa = a->a, *v;
339:   PetscInt                          *AiLo, *AjLo;
340:   PetscInt                           i, nz, nzLower, offset, rowOffset;

342:   PetscFunctionBegin;
343:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
344:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
345:     try {
346:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
347:       nzLower = n + ai[n] - ai[1];
348:       if (!loTriFactor) {
349:         PetscScalar *AALo;

351:         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

353:         /* Allocate Space for the lower triangular matrix */
354:         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
355:         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

357:         /* Fill the lower triangular matrix */
358:         AiLo[0]   = (PetscInt)0;
359:         AiLo[n]   = nzLower;
360:         AjLo[0]   = (PetscInt)0;
361:         AALo[0]   = (MatScalar)1.0;
362:         v         = aa;
363:         vi        = aj;
364:         offset    = 1;
365:         rowOffset = 1;
366:         for (i = 1; i < n; i++) {
367:           nz = ai[i + 1] - ai[i];
368:           /* additional 1 for the term on the diagonal */
369:           AiLo[i] = rowOffset;
370:           rowOffset += nz + 1;

372:           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
373:           PetscCall(PetscArraycpy(&AALo[offset], v, nz));

375:           offset += nz;
376:           AjLo[offset] = (PetscInt)i;
377:           AALo[offset] = (MatScalar)1.0;
378:           offset += 1;

380:           v += nz;
381:           vi += nz;
382:         }

384:         /* allocate space for the triangular factor information */
385:         PetscCall(PetscNew(&loTriFactor));
386:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
387:         /* Create the matrix description */
388:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
389:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
390:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
391:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
392:   #else
393:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
394:   #endif
395:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
396:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

398:         /* set the operation */
399:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

401:         /* set the matrix */
402:         loTriFactor->csrMat              = new CsrMatrix;
403:         loTriFactor->csrMat->num_rows    = n;
404:         loTriFactor->csrMat->num_cols    = n;
405:         loTriFactor->csrMat->num_entries = nzLower;

407:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
408:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

410:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
411:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

413:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
414:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

416:         /* Create the solve analysis information */
417:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
418:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
419:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
420:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
421:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
422:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
423:   #endif

425:         /* perform the solve analysis */
426:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
427:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
428:         PetscCallCUDA(WaitForCUDA());
429:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

431:         /* assign the pointer */
432:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
433:         loTriFactor->AA_h                                          = AALo;
434:         PetscCallCUDA(cudaFreeHost(AiLo));
435:         PetscCallCUDA(cudaFreeHost(AjLo));
436:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
437:       } else { /* update values only */
438:         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
439:         /* Fill the lower triangular matrix */
440:         loTriFactor->AA_h[0] = 1.0;
441:         v                    = aa;
442:         vi                   = aj;
443:         offset               = 1;
444:         for (i = 1; i < n; i++) {
445:           nz = ai[i + 1] - ai[i];
446:           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
447:           offset += nz;
448:           loTriFactor->AA_h[offset] = 1.0;
449:           offset += 1;
450:           v += nz;
451:         }
452:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
453:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
454:       }
455:     } catch (char *ex) {
456:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
457:     }
458:   }
459:   PetscFunctionReturn(PETSC_SUCCESS);
460: }

462: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
463: {
464:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
465:   PetscInt                           n                  = A->rmap->n;
466:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
467:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
468:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
469:   const MatScalar                   *aa = a->a, *v;
470:   PetscInt                          *AiUp, *AjUp;
471:   PetscInt                           i, nz, nzUpper, offset;

473:   PetscFunctionBegin;
474:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
475:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
476:     try {
477:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
478:       nzUpper = adiag[0] - adiag[n];
479:       if (!upTriFactor) {
480:         PetscScalar *AAUp;

482:         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

484:         /* Allocate Space for the upper triangular matrix */
485:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
486:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

488:         /* Fill the upper triangular matrix */
489:         AiUp[0] = (PetscInt)0;
490:         AiUp[n] = nzUpper;
491:         offset  = nzUpper;
492:         for (i = n - 1; i >= 0; i--) {
493:           v  = aa + adiag[i + 1] + 1;
494:           vi = aj + adiag[i + 1] + 1;

496:           /* number of elements NOT on the diagonal */
497:           nz = adiag[i] - adiag[i + 1] - 1;

499:           /* decrement the offset */
500:           offset -= (nz + 1);

502:           /* first, set the diagonal elements */
503:           AjUp[offset] = (PetscInt)i;
504:           AAUp[offset] = (MatScalar)1. / v[nz];
505:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

507:           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
508:           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
509:         }

511:         /* allocate space for the triangular factor information */
512:         PetscCall(PetscNew(&upTriFactor));
513:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

515:         /* Create the matrix description */
516:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
517:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
518:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
519:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
520:   #else
521:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
522:   #endif
523:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
524:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

526:         /* set the operation */
527:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

529:         /* set the matrix */
530:         upTriFactor->csrMat              = new CsrMatrix;
531:         upTriFactor->csrMat->num_rows    = n;
532:         upTriFactor->csrMat->num_cols    = n;
533:         upTriFactor->csrMat->num_entries = nzUpper;

535:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
536:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

538:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
539:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

541:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
542:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

544:         /* Create the solve analysis information */
545:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
546:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
547:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
548:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
549:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
550:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
551:   #endif

553:         /* perform the solve analysis */
554:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
555:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

557:         PetscCallCUDA(WaitForCUDA());
558:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

560:         /* assign the pointer */
561:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
562:         upTriFactor->AA_h                                          = AAUp;
563:         PetscCallCUDA(cudaFreeHost(AiUp));
564:         PetscCallCUDA(cudaFreeHost(AjUp));
565:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
566:       } else {
567:         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
568:         /* Fill the upper triangular matrix */
569:         offset = nzUpper;
570:         for (i = n - 1; i >= 0; i--) {
571:           v = aa + adiag[i + 1] + 1;

573:           /* number of elements NOT on the diagonal */
574:           nz = adiag[i] - adiag[i + 1] - 1;

576:           /* decrement the offset */
577:           offset -= (nz + 1);

579:           /* first, set the diagonal elements */
580:           upTriFactor->AA_h[offset] = 1. / v[nz];
581:           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
582:         }
583:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
584:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
585:       }
586:     } catch (char *ex) {
587:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
588:     }
589:   }
590:   PetscFunctionReturn(PETSC_SUCCESS);
591: }
592: #endif

594: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
595: {
596:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
597:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
598:   IS                            isrow = a->row, isicol = a->icol;
599:   PetscBool                     row_identity, col_identity;
600:   PetscInt                      n = A->rmap->n;

602:   PetscFunctionBegin;
603:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
604: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
605:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
606: #else
607:   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
608:   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
609:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
610: #endif

612:   cusparseTriFactors->nnz = a->nz;

614:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
615:   /* lower triangular indices */
616:   PetscCall(ISIdentity(isrow, &row_identity));
617:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
618:     const PetscInt *r;

620:     PetscCall(ISGetIndices(isrow, &r));
621:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
622:     cusparseTriFactors->rpermIndices->assign(r, r + n);
623:     PetscCall(ISRestoreIndices(isrow, &r));
624:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
625:   }

627:   /* upper triangular indices */
628:   PetscCall(ISIdentity(isicol, &col_identity));
629:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
630:     const PetscInt *c;

632:     PetscCall(ISGetIndices(isicol, &c));
633:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
634:     cusparseTriFactors->cpermIndices->assign(c, c + n);
635:     PetscCall(ISRestoreIndices(isicol, &c));
636:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
637:   }
638:   PetscFunctionReturn(PETSC_SUCCESS);
639: }

641: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
642: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
643: {
644:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
645:   PetscInt                      m  = A->rmap->n;
646:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
647:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
648:   const MatScalar              *Aa = a->a;
649:   PetscInt                     *Mj, Mnz;
650:   PetscScalar                  *Ma, *D;

652:   PetscFunctionBegin;
653:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
654:     if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
655:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
656:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
657:       Mnz = Ai[m]; // Unz (with the unit diagonal)
658:       PetscCall(PetscMalloc1(Mnz, &Ma));
659:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
660:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
661:       for (PetscInt i = 0; i < m; i++) {
662:         PetscInt ulen = Ai[i + 1] - Ai[i];
663:         Mj[Ai[i]]     = i;                                              // diagonal entry
664:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
665:       }
666:       // Copy M (U) from host to device
667:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
668:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
669:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
670:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
671:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
672:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

674:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
675:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
676:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
677:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
678:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
679:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
680:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
681:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

683:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
684:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
685:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

687:       // Allocate work vectors in SpSv
688:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
689:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

691:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
692:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

694:       // Query buffer sizes for SpSV and then allocate buffers
695:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
696:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
697:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

699:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
700:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
701:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

703:       // Record for reuse
704:       fs->csrVal_h = Ma;
705:       fs->diag_h   = D;
706:       PetscCall(PetscFree(Mj));
707:     }
708:     // Copy the value
709:     Ma  = fs->csrVal_h;
710:     D   = fs->diag_h;
711:     Mnz = Ai[m];
712:     for (PetscInt i = 0; i < m; i++) {
713:       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
714:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
715:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
716:     }
717:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
718:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

720:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
721:     if (fs->updatedSpSVAnalysis) {
722:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
723:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724:     } else
725:   #endif
726:     {
727:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
728:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
729:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
730:       fs->updatedSpSVAnalysis = PETSC_TRUE;
731:     }
732:   }
733:   PetscFunctionReturn(PETSC_SUCCESS);
734: }

736: // Solve Ut D U x = b
737: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
738: {
739:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
740:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
741:   const PetscScalar                    *barray;
742:   PetscScalar                          *xarray;
743:   thrust::device_ptr<const PetscScalar> bGPU;
744:   thrust::device_ptr<PetscScalar>       xGPU;
745:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
746:   PetscInt                              m   = A->rmap->n;

748:   PetscFunctionBegin;
749:   PetscCall(PetscLogGpuTimeBegin());
750:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
751:   PetscCall(VecCUDAGetArrayRead(b, &barray));
752:   xGPU = thrust::device_pointer_cast(xarray);
753:   bGPU = thrust::device_pointer_cast(barray);

755:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
756:   if (fs->rpermIndices) {
757:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
758:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
759:   } else {
760:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
761:   }

763:   // Solve Ut Y = X
764:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
765:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

767:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
768:   // It is basically a vector element-wise multiplication, but cublas does not have it!
769:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

771:   // Solve U X = Y
772:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
773:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
774:   } else {
775:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
776:   }
777:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

779:   // Reorder X with the column permutation if needed, and put the result back to x
780:   if (fs->cpermIndices) {
781:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
782:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
783:   }

785:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
786:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
787:   PetscCall(PetscLogGpuTimeEnd());
788:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
789:   PetscFunctionReturn(PETSC_SUCCESS);
790: }
791: #else
792: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
793: {
794:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
795:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
796:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
797:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
798:   PetscInt                          *AiUp, *AjUp;
799:   PetscScalar                       *AAUp;
800:   PetscScalar                       *AALo;
801:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
802:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
803:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
804:   const MatScalar                   *aa = b->a, *v;

806:   PetscFunctionBegin;
807:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
808:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
809:     try {
810:       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
811:       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
812:       if (!upTriFactor && !loTriFactor) {
813:         /* Allocate Space for the upper triangular matrix */
814:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
815:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

817:         /* Fill the upper triangular matrix */
818:         AiUp[0] = (PetscInt)0;
819:         AiUp[n] = nzUpper;
820:         offset  = 0;
821:         for (i = 0; i < n; i++) {
822:           /* set the pointers */
823:           v  = aa + ai[i];
824:           vj = aj + ai[i];
825:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

827:           /* first, set the diagonal elements */
828:           AjUp[offset] = (PetscInt)i;
829:           AAUp[offset] = (MatScalar)1.0 / v[nz];
830:           AiUp[i]      = offset;
831:           AALo[offset] = (MatScalar)1.0 / v[nz];

833:           offset += 1;
834:           if (nz > 0) {
835:             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
836:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
837:             for (j = offset; j < offset + nz; j++) {
838:               AAUp[j] = -AAUp[j];
839:               AALo[j] = AAUp[j] / v[nz];
840:             }
841:             offset += nz;
842:           }
843:         }

845:         /* allocate space for the triangular factor information */
846:         PetscCall(PetscNew(&upTriFactor));
847:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

849:         /* Create the matrix description */
850:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
851:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
852:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
853:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
854:   #else
855:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
856:   #endif
857:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
858:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

860:         /* set the matrix */
861:         upTriFactor->csrMat              = new CsrMatrix;
862:         upTriFactor->csrMat->num_rows    = A->rmap->n;
863:         upTriFactor->csrMat->num_cols    = A->cmap->n;
864:         upTriFactor->csrMat->num_entries = a->nz;

866:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
867:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

869:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
870:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

872:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
873:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

875:         /* set the operation */
876:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

878:         /* Create the solve analysis information */
879:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
880:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
881:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
882:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
883:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
884:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
885:   #endif

887:         /* perform the solve analysis */
888:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
889:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

891:         PetscCallCUDA(WaitForCUDA());
892:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

894:         /* assign the pointer */
895:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

897:         /* allocate space for the triangular factor information */
898:         PetscCall(PetscNew(&loTriFactor));
899:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

901:         /* Create the matrix description */
902:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
903:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
904:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
905:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
906:   #else
907:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
908:   #endif
909:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
910:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

912:         /* set the operation */
913:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

915:         /* set the matrix */
916:         loTriFactor->csrMat              = new CsrMatrix;
917:         loTriFactor->csrMat->num_rows    = A->rmap->n;
918:         loTriFactor->csrMat->num_cols    = A->cmap->n;
919:         loTriFactor->csrMat->num_entries = a->nz;

921:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
922:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

924:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
925:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

927:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
928:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

930:         /* Create the solve analysis information */
931:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
932:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
933:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
934:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
935:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
936:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
937:   #endif

939:         /* perform the solve analysis */
940:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
941:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

943:         PetscCallCUDA(WaitForCUDA());
944:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

946:         /* assign the pointer */
947:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

949:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
950:         PetscCallCUDA(cudaFreeHost(AiUp));
951:         PetscCallCUDA(cudaFreeHost(AjUp));
952:       } else {
953:         /* Fill the upper triangular matrix */
954:         offset = 0;
955:         for (i = 0; i < n; i++) {
956:           /* set the pointers */
957:           v  = aa + ai[i];
958:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

960:           /* first, set the diagonal elements */
961:           AAUp[offset] = 1.0 / v[nz];
962:           AALo[offset] = 1.0 / v[nz];

964:           offset += 1;
965:           if (nz > 0) {
966:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
967:             for (j = offset; j < offset + nz; j++) {
968:               AAUp[j] = -AAUp[j];
969:               AALo[j] = AAUp[j] / v[nz];
970:             }
971:             offset += nz;
972:           }
973:         }
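        /*
           Editor's note (illustrative): for a single row with nz off-diagonal entries v[0..nz-1]
           and the trailing entry v[nz] taken from the host factor, the loop above packs

             AAUp = { 1.0/v[nz], -v[0],        ..., -v[nz-1]        }
             AALo = { 1.0/v[nz], -v[0]/v[nz],  ..., -v[nz-1]/v[nz]  }

           Only the values differ between the two factors: loTriFactor reuses the upper-triangular
           CSR pattern (AiUp/AjUp) and is solved with CUSPARSE_OPERATION_TRANSPOSE, so this branch
           merely refreshes the two value arrays already resident on the GPU.
        */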
974:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing upper triangular factor");
975:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing lower triangular factor");
976:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
977:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
978:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
979:       }
980:       PetscCallCUDA(cudaFreeHost(AAUp));
981:       PetscCallCUDA(cudaFreeHost(AALo));
982:     } catch (char *ex) {
983:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
984:     }
985:   }
986:   PetscFunctionReturn(PETSC_SUCCESS);
987: }
988: #endif

990: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
991: {
992:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
993:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
994:   IS                            ip                 = a->row;
995:   PetscBool                     perm_identity;
996:   PetscInt                      n = A->rmap->n;

998:   PetscFunctionBegin;
999:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

1001: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1002:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
1003: #else
1004:   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1005:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1006: #endif
1007:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

1009:   A->offloadmask = PETSC_OFFLOAD_BOTH;

1011:   /* lower triangular indices */
1012:   PetscCall(ISIdentity(ip, &perm_identity));
1013:   if (!perm_identity) {
1014:     IS              iip;
1015:     const PetscInt *irip, *rip;

1017:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1018:     PetscCall(ISGetIndices(iip, &irip));
1019:     PetscCall(ISGetIndices(ip, &rip));
1020:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1021:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1022:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1023:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1024:     PetscCall(ISRestoreIndices(iip, &irip));
1025:     PetscCall(ISDestroy(&iip));
1026:     PetscCall(ISRestoreIndices(ip, &rip));
1027:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1028:   }
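  /*
     Editor's note: rpermIndices holds the row permutation rip and cpermIndices its inverse irip
     (so irip[rip[i]] == i). Keeping them as integer arrays on the GPU lets the solves apply the
     reordering as a simple gather, e.g. the pattern used in MatSolve_SeqAIJCUSPARSE():

       thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                    tempGPU->begin());   // tempGPU[i] = b[rip[i]]
  */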
1029:   PetscFunctionReturn(PETSC_SUCCESS);
1030: }

1032: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1033: {
1034:   PetscFunctionBegin;
1035:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1036:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1037:   B->offloadmask = PETSC_OFFLOAD_CPU;

1039: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1040:   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1041:   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1042: #else
1043:   /* determine which version of MatSolve needs to be used. */
1044:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1045:   IS          ip = b->row;
1046:   PetscBool   perm_identity;

1048:   PetscCall(ISIdentity(ip, &perm_identity));
1049:   if (perm_identity) {
1050:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1051:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1052:   } else {
1053:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1054:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1055:   }
1056: #endif
1057:   B->ops->matsolve          = NULL;
1058:   B->ops->matsolvetranspose = NULL;

1060:   /* get the triangular factors */
1061:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1062:   PetscFunctionReturn(PETSC_SUCCESS);
1063: }
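/*
   Editor's note: a minimal sketch (not part of this file) of how the Cholesky/ICC path above is
   normally reached through the public API, assuming a symmetric MATSEQAIJCUSPARSE matrix A and
   vectors b, x:

     Mat           F;
     IS            perm, iperm;
     MatFactorInfo info;

     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ICC, &F));
     PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &perm, &iperm));
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatICCFactorSymbolic(F, A, perm, &info));
     PetscCall(MatCholeskyFactorNumeric(F, A, &info)); // lands in the routine above for this solver type
     PetscCall(MatSolve(F, b, x));

   or, through KSP/PC, with options such as
     -mat_type seqaijcusparse -pc_type icc -pc_factor_mat_solver_type cusparse
*/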

1065: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1066: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1067: {
1068:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1069:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1070:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1071:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1072:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1073:   cusparseIndexBase_t                indexBase;
1074:   cusparseMatrixType_t               matrixType;
1075:   cusparseFillMode_t                 fillMode;
1076:   cusparseDiagType_t                 diagType;

1078:   PetscFunctionBegin;
1079:   /* allocate space for the transpose of the lower triangular factor */
1080:   PetscCall(PetscNew(&loTriFactorT));
1081:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1083:   /* set the matrix descriptors of the lower triangular factor */
1084:   matrixType = cusparseGetMatType(loTriFactor->descr);
1085:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1086:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1087:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

1089:   /* Create the matrix description */
1090:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1091:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1092:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1093:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1094:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

1096:   /* set the operation */
1097:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1099:   /* allocate GPU space for the CSC of the lower triangular factor */
1100:   loTriFactorT->csrMat                 = new CsrMatrix;
1101:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1102:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1103:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1104:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1105:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1106:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1108:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1109:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1110:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1111:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1112:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1113:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1114:   #endif

1116:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1117:   {
1118:     // there is no clean way to have PetscCallCUSPARSE wrap this call, since its argument list contains preprocessor conditionals
1119:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1120:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1121:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1122:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1123:   #else
1124:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1125:   #endif
1126:     PetscCallCUSPARSE(stat);
1127:   }

1129:   PetscCallCUDA(WaitForCUDA());
1130:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1132:   /* Create the solve analysis information */
1133:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1134:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1135:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1136:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1137:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1138:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1139:   #endif

1141:   /* perform the solve analysis */
1142:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1143:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1145:   PetscCallCUDA(WaitForCUDA());
1146:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1148:   /* assign the pointer */
1149:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1151:   /*********************************************/
1152:   /* Now the Transpose of the Upper Tri Factor */
1153:   /*********************************************/

1155:   /* allocate space for the transpose of the upper triangular factor */
1156:   PetscCall(PetscNew(&upTriFactorT));
1157:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1159:   /* set the matrix descriptors of the upper triangular factor */
1160:   matrixType = cusparseGetMatType(upTriFactor->descr);
1161:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1162:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1163:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

1165:   /* Create the matrix description */
1166:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1167:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1168:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1169:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1170:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

1172:   /* set the operation */
1173:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1175:   /* allocate GPU space for the CSC of the upper triangular factor */
1176:   upTriFactorT->csrMat                 = new CsrMatrix;
1177:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1178:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1179:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1180:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1181:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1182:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1184:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1185:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1186:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1187:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1188:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1189:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1190:   #endif

1192:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1193:   {
1194:     // there is no clean way to have PetscCallCUSPARSE wrap this call, since its argument list contains preprocessor conditionals
1195:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1196:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1197:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1198:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1199:   #else
1200:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1201:   #endif
1202:     PetscCallCUSPARSE(stat);
1203:   }

1205:   PetscCallCUDA(WaitForCUDA());
1206:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1208:   /* Create the solve analysis information */
1209:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1210:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1211:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1212:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1213:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1214:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1215:   #endif

1217:   /* perform the solve analysis */
1219:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1220:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1222:   PetscCallCUDA(WaitForCUDA());
1223:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1225:   /* assign the pointer */
1226:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227:   PetscFunctionReturn(PETSC_SUCCESS);
1228: }
1229: #endif

1231: struct PetscScalarToPetscInt {
1232:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1233: };
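/*
   Editor's note: PetscScalarToPetscInt supports the one-time "index map" trick used in
   MatSeqAIJCUSPARSEFormExplicitTranspose() below: csr2csc is run once on a surrogate value array
   holding 0,1,2,... (stored as PetscScalar); converting the permuted output back to integers
   yields csr2csc_i, the position in the original CSR values of each transpose entry. After that,
   refreshing the transpose values never calls cusparse again; it is a plain gather:

     THRUSTARRAY csr2csc_a(nnz);
     thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0);
     // ... run csr2csc on csr2csc_a in place of the real values ...
     thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(),
                       csr2csc_i->begin(), PetscScalarToPetscInt());
     // later, whenever A's values change:
     thrust::copy(thrust::device,
                  thrust::make_permutation_iterator(matrix->values->begin(), csr2csc_i->begin()),
                  thrust::make_permutation_iterator(matrix->values->begin(), csr2csc_i->end()),
                  matrixT->values->begin());
*/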

1235: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1236: {
1237:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1238:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1239:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1240:   cusparseStatus_t              stat;
1241:   cusparseIndexBase_t           indexBase;

1243:   PetscFunctionBegin;
1244:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1245:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1246:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1247:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1248:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1249:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1250:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1251:   PetscCall(PetscLogGpuTimeBegin());
1252:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1253:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1254:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1255:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1256:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1257:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1258:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1260:     /* set alpha and beta */
1261:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1262:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1263:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1264:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1265:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

1268:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1269:       CsrMatrix *matrixT      = new CsrMatrix;
1270:       matstructT->mat         = matrixT;
1271:       matrixT->num_rows       = A->cmap->n;
1272:       matrixT->num_cols       = A->rmap->n;
1273:       matrixT->num_entries    = a->nz;
1274:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1275:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1276:       matrixT->values         = new THRUSTARRAY(a->nz);

1278:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1279:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1281: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1282:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1283:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1284:                                indexBase, cusparse_scalartype);
1285:       PetscCallCUSPARSE(stat);
1286:   #else
1287:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1288:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1290:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1291:            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1292:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1293:         */
1294:       if (matrixT->num_entries) {
1295:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1296:         PetscCallCUSPARSE(stat);

1298:       } else {
1299:         matstructT->matDescr = NULL;
1300:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1301:       }
1302:   #endif
1303: #endif
1304:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1305: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1306:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1307: #else
1308:       CsrMatrix *temp  = new CsrMatrix;
1309:       CsrMatrix *tempT = new CsrMatrix;
1310:       /* First convert HYB to CSR */
1311:       temp->num_rows       = A->rmap->n;
1312:       temp->num_cols       = A->cmap->n;
1313:       temp->num_entries    = a->nz;
1314:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1315:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1316:       temp->values         = new THRUSTARRAY(a->nz);

1318:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1319:       PetscCallCUSPARSE(stat);

1321:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1322:       tempT->num_rows       = A->rmap->n;
1323:       tempT->num_cols       = A->cmap->n;
1324:       tempT->num_entries    = a->nz;
1325:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1326:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1327:       tempT->values         = new THRUSTARRAY(a->nz);

1329:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1330:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1331:       PetscCallCUSPARSE(stat);

1333:       /* Last, convert CSC to HYB */
1334:       cusparseHybMat_t hybMat;
1335:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1336:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1337:       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1338:       PetscCallCUSPARSE(stat);

1340:       /* assign the pointer */
1341:       matstructT->mat = hybMat;
1342:       A->transupdated = PETSC_TRUE;
1343:       /* delete temporaries */
1344:       if (tempT) {
1345:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1346:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1347:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1348:         delete (CsrMatrix *)tempT;
1349:       }
1350:       if (temp) {
1351:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1352:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1353:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1354:         delete (CsrMatrix *)temp;
1355:       }
1356: #endif
1357:     }
1358:   }
1359:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* the transpose mat struct may already be present; update its data */
1360:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1361:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1362:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1363:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1364:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1365:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1366:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1367:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1368:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1369:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1370:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1371:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1372:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1373:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1374:     }
1375:     if (!cusparsestruct->csr2csc_i) {
1376:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1377:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1379:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1380: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1381:       void  *csr2cscBuffer;
1382:       size_t csr2cscBufferSize;
1383:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1384:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1385:       PetscCallCUSPARSE(stat);
1386:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1387: #endif

1389:       if (matrix->num_entries) {
1390:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1391:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1392:            I checked every parameter and they were all fine; I have no clue why cusparse complains.

1394:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1395:            should be filled with indexBase. So I just take a shortcut here.
1396:         */
1397:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1398: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1399:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1400:         PetscCallCUSPARSE(stat);
1401: #else
1402:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1403:         PetscCallCUSPARSE(stat);
1404: #endif
1405:       } else {
1406:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1407:       }

1409:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1410:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1411: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1412:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1413: #endif
1414:     }
1415:     PetscCallThrust(
1416:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1417:   }
1418:   PetscCall(PetscLogGpuTimeEnd());
1419:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1420:   /* the compressed row indices are not used for matTranspose */
1421:   matstructT->cprowIndices = NULL;
1422:   /* assign the pointer */
1423:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1424:   A->transupdated                                = PETSC_TRUE;
1425:   PetscFunctionReturn(PETSC_SUCCESS);
1426: }
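/*
   Editor's note: a tiny worked example of the CSR -> CSC (explicit transpose) layout produced
   above. For the 2 x 3 matrix

       [ 1 0 2 ]
       [ 0 3 4 ]

   A    (CSR): row_offsets = {0, 2, 4},    column_indices = {0, 2, 1, 2},  values = {1, 2, 3, 4}
   A^T  (CSR): row_offsets = {0, 1, 2, 4}, column_indices = {0, 1, 0, 1},  values = {1, 3, 2, 4}

   which is what matrixT stores: num_rows = A->cmap->n, num_cols = A->rmap->n, and the same
   number of entries as A.
*/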

1428: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1429: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1430: {
1431:   const PetscScalar                    *barray;
1432:   PetscScalar                          *xarray;
1433:   thrust::device_ptr<const PetscScalar> bGPU;
1434:   thrust::device_ptr<PetscScalar>       xGPU;
1435:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1436:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1437:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1438:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1439:   PetscInt                              m   = A->rmap->n;

1441:   PetscFunctionBegin;
1442:   PetscCall(PetscLogGpuTimeBegin());
1443:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1444:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1445:   xGPU = thrust::device_pointer_cast(xarray);
1446:   bGPU = thrust::device_pointer_cast(barray);

1448:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1449:   if (fs->rpermIndices) {
1450:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1451:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1452:   } else {
1453:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1454:   }

1456:   // Solve L Y = X
1457:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1458:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1459:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

1461:   // Solve U X = Y
1462:   if (fs->cpermIndices) {
1463:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1464:   } else {
1465:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1466:   }
1467:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

1469:   // Reorder X with the column permutation if needed, and put the result back to x
1470:   if (fs->cpermIndices) {
1471:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1472:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1473:   }
1474:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1475:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1476:   PetscCall(PetscLogGpuTimeEnd());
1477:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1478:   PetscFunctionReturn(PETSC_SUCCESS);
1479: }
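/*
   Editor's note: putting the steps above together, MatSolve_SeqAIJCUSPARSE_LU computes

     X[i] = b[rperm[i]]   (row permutation applied as a gather, only if rpermIndices is set)
     L Y = X,  U X = Y    (two cusparseSpSV_solve() calls)
     x[i] = X[cperm[i]]   (column permutation applied as a gather, only if cpermIndices is set)

   and logs 2*nz - m flops, the usual cost of one forward plus one backward substitution.
*/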

1481: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1482: {
1483:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1484:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1485:   const PetscScalar                    *barray;
1486:   PetscScalar                          *xarray;
1487:   thrust::device_ptr<const PetscScalar> bGPU;
1488:   thrust::device_ptr<PetscScalar>       xGPU;
1489:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1490:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1491:   PetscInt                              m   = A->rmap->n;

1493:   PetscFunctionBegin;
1494:   PetscCall(PetscLogGpuTimeBegin());
1495:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1496:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1497:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1498:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1500:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1501:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1502:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1503:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1504:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1505:   }

1507:   if (!fs->updatedTransposeSpSVAnalysis) {
1508:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1510:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1511:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1512:   }

1514:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1515:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1516:   xGPU = thrust::device_pointer_cast(xarray);
1517:   bGPU = thrust::device_pointer_cast(barray);

1519:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1520:   if (fs->rpermIndices) {
1521:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1522:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1523:   } else {
1524:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1525:   }

1527:   // Solve Ut Y = X
1528:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1529:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

1531:   // Solve Lt X = Y
1532:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1533:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1534:   } else {
1535:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1536:   }
1537:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

1539:   // Reorder X with the column permutation if needed, and put the result back to x
1540:   if (fs->cpermIndices) {
1541:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1542:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1543:   }

1545:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1546:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1547:   PetscCall(PetscLogGpuTimeEnd());
1548:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1549:   PetscFunctionReturn(PETSC_SUCCESS);
1550: }
1551: #else
1552: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1553: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1554: {
1555:   PetscInt                              n = xx->map->n;
1556:   const PetscScalar                    *barray;
1557:   PetscScalar                          *xarray;
1558:   thrust::device_ptr<const PetscScalar> bGPU;
1559:   thrust::device_ptr<PetscScalar>       xGPU;
1560:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1561:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1562:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1563:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1565:   PetscFunctionBegin;
1566:   /* Analyze the matrix and create the transpose ... on the fly */
1567:   if (!loTriFactorT && !upTriFactorT) {
1568:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1569:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1570:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1571:   }

1573:   /* Get the GPU pointers */
1574:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1575:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1576:   xGPU = thrust::device_pointer_cast(xarray);
1577:   bGPU = thrust::device_pointer_cast(barray);

1579:   PetscCall(PetscLogGpuTimeBegin());
1580:   /* First, reorder with the row permutation */
1581:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1583:   /* Next, solve U */
1584:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1585:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1587:   /* Then, solve L */
1588:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1589:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1591:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1592:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1594:   /* Copy the temporary to the full solution. */
1595:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1597:   /* restore */
1598:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1599:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1600:   PetscCall(PetscLogGpuTimeEnd());
1601:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1602:   PetscFunctionReturn(PETSC_SUCCESS);
1603: }

1605: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1606: {
1607:   const PetscScalar                 *barray;
1608:   PetscScalar                       *xarray;
1609:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1610:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1611:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1612:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1614:   PetscFunctionBegin;
1615:   /* Analyze the matrix and create the transpose ... on the fly */
1616:   if (!loTriFactorT && !upTriFactorT) {
1617:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1618:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1619:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1620:   }

1622:   /* Get the GPU pointers */
1623:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1624:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1626:   PetscCall(PetscLogGpuTimeBegin());
1627:   /* First, solve U */
1628:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1629:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1631:   /* Then, solve L */
1632:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1633:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1635:   /* restore */
1636:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1637:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1638:   PetscCall(PetscLogGpuTimeEnd());
1639:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1640:   PetscFunctionReturn(PETSC_SUCCESS);
1641: }

1643: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1644: {
1645:   const PetscScalar                    *barray;
1646:   PetscScalar                          *xarray;
1647:   thrust::device_ptr<const PetscScalar> bGPU;
1648:   thrust::device_ptr<PetscScalar>       xGPU;
1649:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1650:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1651:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1652:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1654:   PetscFunctionBegin;
1655:   /* Get the GPU pointers */
1656:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1657:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1658:   xGPU = thrust::device_pointer_cast(xarray);
1659:   bGPU = thrust::device_pointer_cast(barray);

1661:   PetscCall(PetscLogGpuTimeBegin());
1662:   /* First, reorder with the row permutation */
1663:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

1665:   /* Next, solve L */
1666:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1667:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1669:   /* Then, solve U */
1670:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1671:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1673:   /* Last, reorder with the column permutation */
1674:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

1676:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1677:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1678:   PetscCall(PetscLogGpuTimeEnd());
1679:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1680:   PetscFunctionReturn(PETSC_SUCCESS);
1681: }

1683: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1684: {
1685:   const PetscScalar                 *barray;
1686:   PetscScalar                       *xarray;
1687:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1688:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1689:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1690:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1692:   PetscFunctionBegin;
1693:   /* Get the GPU pointers */
1694:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1695:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1697:   PetscCall(PetscLogGpuTimeBegin());
1698:   /* First, solve L */
1699:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1700:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1702:   /* Next, solve U */
1703:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1704:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1706:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1707:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1708:   PetscCall(PetscLogGpuTimeEnd());
1709:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1710:   PetscFunctionReturn(PETSC_SUCCESS);
1711: }
1712: #endif

1714: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1715: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1716: {
1717:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1718:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1719:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1720:   CsrMatrix                    *Acsr;
1721:   PetscInt                      m, nz;
1722:   PetscBool                     flg;

1724:   PetscFunctionBegin;
1725:   if (PetscDefined(USE_DEBUG)) {
1726:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1727:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1728:   }

1730:   /* Copy A's value to fact */
1731:   m  = fact->rmap->n;
1732:   nz = aij->nz;
1733:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1734:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1735:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1737:   PetscCall(PetscLogGpuTimeBegin());
1738:   /* Factorize fact inplace */
1739:   if (m)
1740:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1741:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1742:   if (PetscDefined(USE_DEBUG)) {
1743:     int              numerical_zero;
1744:     cusparseStatus_t status;
1745:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1746:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1747:   }

1749:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1750:   if (fs->updatedSpSVAnalysis) {
1751:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1752:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1753:   } else
1754:   #endif
1755:   {
1756:     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1757:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1758:     */
1759:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1761:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1763:     fs->updatedSpSVAnalysis = PETSC_TRUE;
1764:     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1765:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1766:   }

1768:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1769:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1770:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1771:   fact->ops->matsolve          = NULL;
1772:   fact->ops->matsolvetranspose = NULL;
1773:   PetscCall(PetscLogGpuTimeEnd());
1774:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1775:   PetscFunctionReturn(PETSC_SUCCESS);
1776: }
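/*
   Editor's note: a hedged usage sketch; this ILU(0) path is typically reached with run-time
   options along the lines of

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   The numeric phase above only refreshes values: it copies A's CSR values into the factor, runs
   csrilu02 in place on the GPU, and then performs (or, on CUDA >= 12.1.1, updates) the SpSV
   analyses so that subsequent MatSolve_SeqAIJCUSPARSE_LU() calls can reuse the same descriptors.
*/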

1778: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1779: {
1780:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1781:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1782:   PetscInt                      m, nz;

1784:   PetscFunctionBegin;
1785:   if (PetscDefined(USE_DEBUG)) {
1786:     PetscInt  i;
1787:     PetscBool flg, missing;

1789:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1790:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1791:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1792:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1793:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1794:   }

1796:   /* Free the old stale stuff */
1797:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1799:   /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on host,
1800:      but they will not be used; we allocate them only to make debugging easier.
1801:    */
1802:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1804:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1805:   fact->factortype             = MAT_FACTOR_ILU;
1806:   fact->info.factor_mallocs    = 0;
1807:   fact->info.fill_ratio_given  = info->fill;
1808:   fact->info.fill_ratio_needed = 1.0;

1810:   aij->row = NULL;
1811:   aij->col = NULL;

1813:   /* ====================================================================== */
1814:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1815:   /* We'll do in-place factorization on fact                                */
1816:   /* ====================================================================== */
1817:   const int *Ai, *Aj;

1819:   m  = fact->rmap->n;
1820:   nz = aij->nz;

1822:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1823:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1824:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1825:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1826:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1827:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1829:   /* ====================================================================== */
1830:   /* Create descriptors for M, L, U                                         */
1831:   /* ====================================================================== */
1832:   cusparseFillMode_t fillMode;
1833:   cusparseDiagType_t diagType;

1835:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1836:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1837:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1839:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1840:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1841:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1842:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1843:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1844:   */
1845:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1846:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1847:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1848:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1849:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1851:   fillMode = CUSPARSE_FILL_MODE_UPPER;
1852:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1853:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1854:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1855:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1857:   /* ========================================================================= */
1858:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1859:   /* ========================================================================= */
1860:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1861:   if (m)
1862:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1863:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

1865:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1866:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1868:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1869:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1871:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1872:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1874:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1875:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1877:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1878:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1879:      spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1880:      To save memory, we make factBuffer_M share with the larger of spsvBuffer_L/U; a small sizing example follows the allocation below.
1881:    */
1882:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1883:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1884:     fs->spsvBuffer_L = fs->factBuffer_M;
1885:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1886:   } else {
1887:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1888:     fs->spsvBuffer_U = fs->factBuffer_M;
1889:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1890:   }
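  /* Illustrative sizing example (numbers are hypothetical, not measured): suppose spsvBufferSize_L = 3 MB,
     spsvBufferSize_U = 2 MB and factBufferSize_M = 4 MB. The branch above then takes the first case: it allocates
     factBuffer_M = max(3, 4) = 4 MB, points spsvBuffer_L at it, and allocates a separate 2 MB spsvBuffer_U,
     for 6 MB in total instead of the 9 MB needed without sharing. */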

1892:   /* ========================================================================== */
1893:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1894:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1895:   /* ========================================================================== */
1896:   int              structural_zero;
1897:   cusparseStatus_t status;

1899:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1900:   if (m)
1901:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1902:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1903:   if (PetscDefined(USE_DEBUG)) {
1904:     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1905:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1906:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1907:   }

1909:   /* Estimate FLOPs of the numeric factorization */
1910:   {
1911:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1912:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1913:     PetscLogDouble flops = 0.0;

1915:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1916:     Ai    = Aseq->i;
1917:     Adiag = Aseq->diag;
1918:     for (PetscInt i = 0; i < m; i++) {
1919:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1920:         nzRow  = Ai[i + 1] - Ai[i];
1921:         nzLeft = Adiag[i] - Ai[i];
1922:         /* We want to eliminate the nonzeros left of the diagonal one by one. Assume that each elimination updates
1923:           the nonzeros to the right of (and including) the eliminated one, incurring a multiplication and an addition
1924:           per updated entry (a worked example follows this block). */
1926:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1927:       }
1928:     }
1929:     fs->numericFactFlops = flops;
1930:   }
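  /* Worked example of the estimate above (hypothetical row, for illustration only): if row i has nzRow = 5 nonzeros,
     of which nzLeft = 2 lie left of the diagonal, the loop adds 2 * (2.0 * 5 - 2 + 1) = 18 flops for that row. */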
1931:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1932:   PetscFunctionReturn(PETSC_SUCCESS);
1933: }

1935: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1936: {
1937:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1938:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1939:   const PetscScalar            *barray;
1940:   PetscScalar                  *xarray;

1942:   PetscFunctionBegin;
1943:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1944:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1945:   PetscCall(PetscLogGpuTimeBegin());

1947:   /* Solve L*y = b */
1948:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1949:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1950:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1951:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1953:   /* Solve Lt*x = y */
1954:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1955:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1956:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1958:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1959:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1961:   PetscCall(PetscLogGpuTimeEnd());
1962:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1963:   PetscFunctionReturn(PETSC_SUCCESS);
1964: }

1966: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1967: {
1968:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1969:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1970:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1971:   CsrMatrix                    *Acsr;
1972:   PetscInt                      m, nz;
1973:   PetscBool                     flg;

1975:   PetscFunctionBegin;
1976:   if (PetscDefined(USE_DEBUG)) {
1977:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1978:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1979:   }

1981:   /* Copy A's value to fact */
1982:   m  = fact->rmap->n;
1983:   nz = aij->nz;
1984:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1985:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1986:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1988:   /* Factorize fact inplace */
1989:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1990:      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1991:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1992:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1993:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1994:    */
1995:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1996:   if (PetscDefined(USE_DEBUG)) {
1997:     int              numerical_zero;
1998:     cusparseStatus_t status;
1999:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2000:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2001:   }

2003:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2004:   if (fs->updatedSpSVAnalysis) {
2005:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2006:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2007:   } else
2008:   #endif
2009:   {
2010:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

2012:     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2013:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2014:   */
2015:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2016:     fs->updatedSpSVAnalysis = PETSC_TRUE;
2017:   }
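  /* Note: cusparseSpSV_updateMatrix() (CUDA >= 12.1.1) only refreshes the values cached inside the SpSV descriptors.
     Since the IC(0) sparsity pattern does not change between refactorizations, the expensive cusparseSpSV_analysis()
     phase above only needs to run once. */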

2019:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2020:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2021:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2022:   fact->ops->matsolve          = NULL;
2023:   fact->ops->matsolvetranspose = NULL;
2024:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2025:   PetscFunctionReturn(PETSC_SUCCESS);
2026: }

2028: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2029: {
2030:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2031:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2032:   PetscInt                      m, nz;

2034:   PetscFunctionBegin;
2035:   if (PetscDefined(USE_DEBUG)) {
2036:     PetscInt  i;
2037:     PetscBool flg, missing;

2039:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2040:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2041:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2042:     PetscCall(MatMissingDiagonal(A, &missing, &i));
2043:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2044:   }

2046:   /* Free the old stale stuff */
2047:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

2049:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
2050:      but they will not be used; we allocate them just for easy debugging.
2051:    */
2052:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

2054:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2055:   fact->factortype             = MAT_FACTOR_ICC;
2056:   fact->info.factor_mallocs    = 0;
2057:   fact->info.fill_ratio_given  = info->fill;
2058:   fact->info.fill_ratio_needed = 1.0;

2060:   aij->row = NULL;
2061:   aij->col = NULL;

2063:   /* ====================================================================== */
2064:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2065:   /* We'll do in-place factorization on fact                                */
2066:   /* ====================================================================== */
2067:   const int *Ai, *Aj;

2069:   m  = fact->rmap->n;
2070:   nz = aij->nz;

2072:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2073:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2074:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2075:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2076:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2077:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

2079:   /* ====================================================================== */
2080:   /* Create mat descriptors for M, L                                        */
2081:   /* ====================================================================== */
2082:   cusparseFillMode_t fillMode;
2083:   cusparseDiagType_t diagType;

2085:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2086:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2087:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

2089:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2090:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2091:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2092:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2093:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2094:   */
2095:   fillMode = CUSPARSE_FILL_MODE_LOWER;
2096:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2097:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2098:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2099:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

2101:   /* ========================================================================= */
2102:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2103:   /* ========================================================================= */
2104:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2105:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

2107:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2108:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

2110:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2111:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

2113:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2114:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

2116:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2117:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

2119:   /* To save device memory, we make the factorization buffer share with one of the solver buffers.
2120:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2121:    */
2122:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2123:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2124:     fs->spsvBuffer_L = fs->factBuffer_M;
2125:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2126:   } else {
2127:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2128:     fs->spsvBuffer_Lt = fs->factBuffer_M;
2129:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2130:   }

2132:   /* ========================================================================== */
2133:   /* Perform analysis of ic0 on M                                               */
2134:   /* The lower triangular part of M has the same sparsity pattern as L          */
2135:   /* ========================================================================== */
2136:   int              structural_zero;
2137:   cusparseStatus_t status;

2139:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2140:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2141:   if (PetscDefined(USE_DEBUG)) {
2142:     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2143:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2144:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2145:   }

2147:   /* Estimate FLOPs of the numeric factorization */
2148:   {
2149:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2150:     PetscInt      *Ai, nzRow, nzLeft;
2151:     PetscLogDouble flops = 0.0;

2153:     Ai = Aseq->i;
2154:     for (PetscInt i = 0; i < m; i++) {
2155:       nzRow = Ai[i + 1] - Ai[i];
2156:       if (nzRow > 1) {
2157:         /* We want to eliminate the nonzeros left of the diagonal one by one. Assume that each elimination updates
2158:           the nonzeros to the right of (and including) the eliminated one, incurring a multiplication and an addition
2159:           per updated entry. Lacking the diagonal position here, nzLeft is estimated as half of the row's off-diagonal nonzeros. */
2160:         nzLeft = (nzRow - 1) / 2;
2161:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2162:       }
2163:     }
2164:     fs->numericFactFlops = flops;
2165:   }
2166:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2167:   PetscFunctionReturn(PETSC_SUCCESS);
2168: }
2169: #endif

2171: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2172: {
2173:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2174:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

2176:   PetscFunctionBegin;
2177:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2178:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2179:   B->offloadmask = PETSC_OFFLOAD_CPU;

2181:   if (!cusparsestruct->use_cpu_solve) {
2182: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2183:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2184:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2185: #else
2186:     /* determine which version of MatSolve needs to be used. */
2187:     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2188:     IS          isrow = b->row, iscol = b->col;
2189:     PetscBool   row_identity, col_identity;

2191:     PetscCall(ISIdentity(isrow, &row_identity));
2192:     PetscCall(ISIdentity(iscol, &col_identity));
2193:     if (row_identity && col_identity) {
2194:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2195:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2196:     } else {
2197:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2198:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2199:     }
2200: #endif
2201:   }
2202:   B->ops->matsolve          = NULL;
2203:   B->ops->matsolvetranspose = NULL;

2205:   /* get the triangular factors */
2206:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2207:   PetscFunctionReturn(PETSC_SUCCESS);
2208: }

2210: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2211: {
2212:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

2214:   PetscFunctionBegin;
2215:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2216:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2217:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2218:   PetscFunctionReturn(PETSC_SUCCESS);
2219: }

2221: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2222: {
2223:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2225:   PetscFunctionBegin;
2226: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2227:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2228:   if (!info->factoronhost) {
2229:     PetscCall(ISIdentity(isrow, &row_identity));
2230:     PetscCall(ISIdentity(iscol, &col_identity));
2231:   }
2232:   if (!info->levels && row_identity && col_identity) {
2233:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2234:   } else
2235: #endif
2236:   {
2237:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2238:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2239:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2240:   }
2241:   PetscFunctionReturn(PETSC_SUCCESS);
2242: }
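
/* Dispatch note (illustrative; the options shown are standard PETSc options): with CUDA >= 11.4 the device ILU(0)
   path above is taken for the usual PCILU defaults, e.g.
     -pc_type ilu -pc_factor_levels 0 -pc_factor_mat_ordering_type natural
   whereas requesting fill (e.g. -pc_factor_levels 1) or a non-identity ordering falls back to the host symbolic
   factorization, with MatLUFactorNumeric_SeqAIJCUSPARSE handling the numeric phase. */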

2244: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2245: {
2246:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2248:   PetscFunctionBegin;
2249: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2250:   PetscBool perm_identity = PETSC_FALSE;
2251:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2252:   if (!info->levels && perm_identity) {
2253:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2254:   } else
2255: #endif
2256:   {
2257:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2258:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2259:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2260:   }
2261:   PetscFunctionReturn(PETSC_SUCCESS);
2262: }

2264: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2265: {
2266:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2268:   PetscFunctionBegin;
2269:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2270:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2271:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2272:   PetscFunctionReturn(PETSC_SUCCESS);
2273: }

2275: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2276: {
2277:   PetscFunctionBegin;
2278:   *type = MATSOLVERCUSPARSE;
2279:   PetscFunctionReturn(PETSC_SUCCESS);
2280: }

2282: /*MC
2283:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2284:   on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
2285:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2286:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2287:   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2288:   algorithms are not recommended. This class does NOT support direct solver operations.

2290:   Level: beginner

2292: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2293:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2294: M*/
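
/* A minimal usage sketch (illustrative; assumes a KSP/PC has been set up elsewhere): this solver is selected
   either from the command line,
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   or programmatically,
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
*/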

2296: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2297: {
2298:   PetscInt n = A->rmap->n;

2300:   PetscFunctionBegin;
2301:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2302:   PetscCall(MatSetSizes(*B, n, n, n, n));
2303:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2304:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

2306:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2307:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2308:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2309:     if (!A->boundtocpu) {
2310:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2311:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2312:     } else {
2313:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2314:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2315:     }
2316:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2317:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2318:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2319:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2320:     if (!A->boundtocpu) {
2321:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2322:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2323:     } else {
2324:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2325:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2326:     }
2327:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2328:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2329:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

2331:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2332:   (*B)->canuseordering = PETSC_TRUE;
2333:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2334:   PetscFunctionReturn(PETSC_SUCCESS);
2335: }

2337: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2338: {
2339:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2340:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2341: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2342:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2343: #endif

2345:   PetscFunctionBegin;
2346:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2347:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2348:     if (A->factortype == MAT_FACTOR_NONE) {
2349:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2350:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2351:     }
2352: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2353:     else if (fs->csrVal) {
2354:       /* We have a factorized matrix on device and are able to copy it to host */
2355:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2356:     }
2357: #endif
2358:     else
2359:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2360:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2361:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2362:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2363:   }
2364:   PetscFunctionReturn(PETSC_SUCCESS);
2365: }

2367: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2368: {
2369:   PetscFunctionBegin;
2370:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2371:   *array = ((Mat_SeqAIJ *)A->data)->a;
2372:   PetscFunctionReturn(PETSC_SUCCESS);
2373: }

2375: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2376: {
2377:   PetscFunctionBegin;
2378:   A->offloadmask = PETSC_OFFLOAD_CPU;
2379:   *array         = NULL;
2380:   PetscFunctionReturn(PETSC_SUCCESS);
2381: }

2383: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2384: {
2385:   PetscFunctionBegin;
2386:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2387:   *array = ((Mat_SeqAIJ *)A->data)->a;
2388:   PetscFunctionReturn(PETSC_SUCCESS);
2389: }

2391: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2392: {
2393:   PetscFunctionBegin;
2394:   *array = NULL;
2395:   PetscFunctionReturn(PETSC_SUCCESS);
2396: }

2398: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2399: {
2400:   PetscFunctionBegin;
2401:   *array = ((Mat_SeqAIJ *)A->data)->a;
2402:   PetscFunctionReturn(PETSC_SUCCESS);
2403: }

2405: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2406: {
2407:   PetscFunctionBegin;
2408:   A->offloadmask = PETSC_OFFLOAD_CPU;
2409:   *array         = NULL;
2410:   PetscFunctionReturn(PETSC_SUCCESS);
2411: }
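
/* Illustrative sketch of the offload semantics implemented by the wrappers above (hypothetical snippet; assumes A is
   a MATSEQAIJCUSPARSE whose latest values live on the GPU):
     PetscScalar *vals;
     PetscCall(MatSeqAIJGetArray(A, &vals));     // read path: copies values device->host, offloadmask becomes PETSC_OFFLOAD_BOTH
     vals[0] *= 2.0;                             // modify on the host
     PetscCall(MatSeqAIJRestoreArray(A, &vals)); // marks offloadmask PETSC_OFFLOAD_CPU, so the next device use re-uploads the values
*/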

2413: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2414: {
2415:   Mat_SeqAIJCUSPARSE *cusp;
2416:   CsrMatrix          *matrix;

2418:   PetscFunctionBegin;
2419:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2420:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2421:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2422:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2423:   matrix = (CsrMatrix *)cusp->mat->mat;

2425:   if (i) {
2426: #if !defined(PETSC_USE_64BIT_INDICES)
2427:     *i = matrix->row_offsets->data().get();
2428: #else
2429:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2430: #endif
2431:   }
2432:   if (j) {
2433: #if !defined(PETSC_USE_64BIT_INDICES)
2434:     *j = matrix->column_indices->data().get();
2435: #else
2436:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2437: #endif
2438:   }
2439:   if (a) *a = matrix->values->data().get();
2440:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2441:   PetscFunctionReturn(PETSC_SUCCESS);
2442: }
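
/* Illustrative usage sketch (hypothetical snippet): callers can obtain the device CSR arrays directly, e.g.
     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype)); // with this implementation mtype is PETSC_MEMTYPE_CUDA and i, j, a are device pointers
*/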

2444: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2445: {
2446:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2447:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2448:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2449:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2450:   cusparseStatus_t              stat;
2451:   PetscBool                     both = PETSC_TRUE;

2453:   PetscFunctionBegin;
2454:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2455:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2456:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2457:       CsrMatrix *matrix;
2458:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2460:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2461:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2462:       matrix->values->assign(a->a, a->a + a->nz);
2463:       PetscCallCUDA(WaitForCUDA());
2464:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2465:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2466:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2467:     } else {
2468:       PetscInt nnz;
2469:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2470:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2471:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2472:       delete cusparsestruct->workVector;
2473:       delete cusparsestruct->rowoffsets_gpu;
2474:       cusparsestruct->workVector     = NULL;
2475:       cusparsestruct->rowoffsets_gpu = NULL;
2476:       try {
2477:         if (a->compressedrow.use) {
2478:           m    = a->compressedrow.nrows;
2479:           ii   = a->compressedrow.i;
2480:           ridx = a->compressedrow.rindex;
2481:         } else {
2482:           m    = A->rmap->n;
2483:           ii   = a->i;
2484:           ridx = NULL;
2485:         }
2486:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2487:         if (!a->a) {
2488:           nnz  = ii[m];
2489:           both = PETSC_FALSE;
2490:         } else nnz = a->nz;
2491:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2493:         /* create cusparse matrix */
2494:         cusparsestruct->nrows = m;
2495:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2496:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2497:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2498:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

2500:         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2501:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2502:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2503:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2504:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2506:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

2508:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2509:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2510:           /* set the matrix */
2511:           CsrMatrix *mat   = new CsrMatrix;
2512:           mat->num_rows    = m;
2513:           mat->num_cols    = A->cmap->n;
2514:           mat->num_entries = nnz;
2515:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2516:           mat->row_offsets->assign(ii, ii + m + 1);

2518:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2519:           mat->column_indices->assign(a->j, a->j + nnz);

2521:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2522:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2524:           /* assign the pointer */
2525:           matstruct->mat = mat;
2526: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2527:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2528:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2529:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2530:             PetscCallCUSPARSE(stat);
2531:           }
2532: #endif
2533:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2534: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2535:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2536: #else
2537:           CsrMatrix *mat   = new CsrMatrix;
2538:           mat->num_rows    = m;
2539:           mat->num_cols    = A->cmap->n;
2540:           mat->num_entries = nnz;
2541:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2542:           mat->row_offsets->assign(ii, ii + m + 1);

2544:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2545:           mat->column_indices->assign(a->j, a->j + nnz);

2547:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2548:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2550:           cusparseHybMat_t hybMat;
2551:           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2552:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2553:           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2554:           PetscCallCUSPARSE(stat);
2555:           /* assign the pointer */
2556:           matstruct->mat = hybMat;

2558:           if (mat) {
2559:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2560:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2561:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2562:             delete (CsrMatrix *)mat;
2563:           }
2564: #endif
2565:         }

2567:         /* assign the compressed row indices */
2568:         if (a->compressedrow.use) {
2569:           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2570:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2571:           matstruct->cprowIndices->assign(ridx, ridx + m);
2572:           tmp = m;
2573:         } else {
2574:           cusparsestruct->workVector = NULL;
2575:           matstruct->cprowIndices    = NULL;
2576:           tmp                        = 0;
2577:         }
2578:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2580:         /* assign the pointer */
2581:         cusparsestruct->mat = matstruct;
2582:       } catch (char *ex) {
2583:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2584:       }
2585:       PetscCallCUDA(WaitForCUDA());
2586:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2587:       cusparsestruct->nonzerostate = A->nonzerostate;
2588:     }
2589:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2590:   }
2591:   PetscFunctionReturn(PETSC_SUCCESS);
2592: }
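
/* Summary of the two paths above (descriptive only, no new behavior): when the nonzero pattern is unchanged
   (A->nonzerostate == cusparsestruct->nonzerostate) and the format is CSR, only the values are re-uploaded and the
   cached transpose is merely marked stale; otherwise the whole device representation (row offsets, column indices,
   values and the compressed-row index list) is destroyed and rebuilt. */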

2594: struct VecCUDAPlusEquals {
2595:   template <typename Tuple>
2596:   __host__ __device__ void operator()(Tuple t)
2597:   {
2598:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2599:   }
2600: };

2602: struct VecCUDAEquals {
2603:   template <typename Tuple>
2604:   __host__ __device__ void operator()(Tuple t)
2605:   {
2606:     thrust::get<1>(t) = thrust::get<0>(t);
2607:   }
2608: };

2610: struct VecCUDAEqualsReverse {
2611:   template <typename Tuple>
2612:   __host__ __device__ void operator()(Tuple t)
2613:   {
2614:     thrust::get<0>(t) = thrust::get<1>(t);
2615:   }
2616: };
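
/* Illustrative sketch of how these functors are typically applied through a zip iterator (hypothetical ranges
   'src' and 'dst' of type thrust::device_ptr<PetscScalar>, both of length n):
     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src, dst)),
                      thrust::make_zip_iterator(thrust::make_tuple(src + n, dst + n)),
                      VecCUDAPlusEquals()); // performs dst[i] += src[i]
*/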

2618: struct MatMatCusparse {
2619:   PetscBool      cisdense;
2620:   PetscScalar   *Bt;
2621:   Mat            X;
2622:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2623:   PetscLogDouble flops;
2624:   CsrMatrix     *Bcsr;

2626: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2627:   cusparseSpMatDescr_t matSpBDescr;
2628:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2629:   cusparseDnMatDescr_t matBDescr;
2630:   cusparseDnMatDescr_t matCDescr;
2631:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2632:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2633:   void *dBuffer4;
2634:   void *dBuffer5;
2635:   #endif
2636:   size_t                mmBufferSize;
2637:   void                 *mmBuffer;
2638:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2639:   cusparseSpGEMMDescr_t spgemmDesc;
2640: #endif
2641: };

2643: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2644: {
2645:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2647:   PetscFunctionBegin;
2648:   PetscCallCUDA(cudaFree(mmdata->Bt));
2649:   delete mmdata->Bcsr;
2650: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2651:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2652:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2653:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2654:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2655:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2656:   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2657:   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2658:   #endif
2659:   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2660:   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2661: #endif
2662:   PetscCall(MatDestroy(&mmdata->X));
2663:   PetscCall(PetscFree(data));
2664:   PetscFunctionReturn(PETSC_SUCCESS);
2665: }

2667: #include <../src/mat/impls/dense/seq/dense.h>

2669: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2670: {
2671:   Mat_Product                  *product = C->product;
2672:   Mat                           A, B;
2673:   PetscInt                      m, n, blda, clda;
2674:   PetscBool                     flg, biscuda;
2675:   Mat_SeqAIJCUSPARSE           *cusp;
2676:   cusparseStatus_t              stat;
2677:   cusparseOperation_t           opA;
2678:   const PetscScalar            *barray;
2679:   PetscScalar                  *carray;
2680:   MatMatCusparse               *mmdata;
2681:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2682:   CsrMatrix                    *csrmat;

2684:   PetscFunctionBegin;
2685:   MatCheckProduct(C, 1);
2686:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2687:   mmdata = (MatMatCusparse *)product->data;
2688:   A      = product->A;
2689:   B      = product->B;
2690:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2691:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2692:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2693:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2694:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2695:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2696:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2697:   switch (product->type) {
2698:   case MATPRODUCT_AB:
2699:   case MATPRODUCT_PtAP:
2700:     mat = cusp->mat;
2701:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2702:     m   = A->rmap->n;
2703:     n   = B->cmap->n;
2704:     break;
2705:   case MATPRODUCT_AtB:
2706:     if (!A->form_explicit_transpose) {
2707:       mat = cusp->mat;
2708:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2709:     } else {
2710:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2711:       mat = cusp->matTranspose;
2712:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2713:     }
2714:     m = A->cmap->n;
2715:     n = B->cmap->n;
2716:     break;
2717:   case MATPRODUCT_ABt:
2718:   case MATPRODUCT_RARt:
2719:     mat = cusp->mat;
2720:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2721:     m   = A->rmap->n;
2722:     n   = B->rmap->n;
2723:     break;
2724:   default:
2725:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2726:   }
2727:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2728:   csrmat = (CsrMatrix *)mat->mat;
2729:   /* if the user passed a CPU matrix, copy the data to the GPU */
2730:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2731:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2732:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

2734:   PetscCall(MatDenseGetLDA(B, &blda));
2735:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2736:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2737:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2738:   } else {
2739:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2740:     PetscCall(MatDenseGetLDA(C, &clda));
2741:   }

2743:   PetscCall(PetscLogGpuTimeBegin());
2744: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2745:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2746:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2747:   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2748:   #else
2749:   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2750:   #endif

2752:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2753:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2754:     size_t mmBufferSize;
2755:     if (mmdata->initialized && mmdata->Blda != blda) {
2756:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2757:       mmdata->matBDescr = NULL;
2758:     }
2759:     if (!mmdata->matBDescr) {
2760:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2761:       mmdata->Blda = blda;
2762:     }

2764:     if (mmdata->initialized && mmdata->Clda != clda) {
2765:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2766:       mmdata->matCDescr = NULL;
2767:     }
2768:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2769:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2770:       mmdata->Clda = clda;
2771:     }

2773:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2774:     if (matADescr) {
2775:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I found I could not reuse matADescr; it could be a cuSPARSE bug
2776:       matADescr = NULL;
2777:     }
2778:   #endif

2780:     if (!matADescr) {
2781:       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2782:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2783:       PetscCallCUSPARSE(stat);
2784:     }

2786:     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

2788:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2789:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2790:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2791:       mmdata->mmBufferSize = mmBufferSize;
2792:     }

2794:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2795:     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2796:   #endif

2798:     mmdata->initialized = PETSC_TRUE;
2799:   } else {
2800:     /* to be safe, always update pointers of the mats */
2801:     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2802:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2803:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2804:   }

2806:   /* do cusparseSpMM, which supports transpose on B */
2807:   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2808: #else
2809:   PetscInt k;
2810:   /* cusparseXcsrmm does not support transpose on B */
2811:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2812:     cublasHandle_t cublasv2handle;
2813:     cublasStatus_t cerr;

2815:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2816:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2817:     PetscCallCUBLAS(cerr);
2818:     blda = B->cmap->n;
2819:     k    = B->cmap->n;
2820:   } else {
2821:     k = B->rmap->n;
2822:   }

2824:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2825:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2826:   PetscCallCUSPARSE(stat);
2827: #endif
2828:   PetscCall(PetscLogGpuTimeEnd());
2829:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2830:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2831:   if (product->type == MATPRODUCT_RARt) {
2832:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2833:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2834:   } else if (product->type == MATPRODUCT_PtAP) {
2835:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2836:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2837:   } else {
2838:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2839:   }
2840:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2841:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2842:   PetscFunctionReturn(PETSC_SUCCESS);
2843: }

2845: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2846: {
2847:   Mat_Product        *product = C->product;
2848:   Mat                 A, B;
2849:   PetscInt            m, n;
2850:   PetscBool           cisdense, flg;
2851:   MatMatCusparse     *mmdata;
2852:   Mat_SeqAIJCUSPARSE *cusp;

2854:   PetscFunctionBegin;
2855:   MatCheckProduct(C, 1);
2856:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2857:   A = product->A;
2858:   B = product->B;
2859:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2860:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2861:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2862:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2863:   switch (product->type) {
2864:   case MATPRODUCT_AB:
2865:     m = A->rmap->n;
2866:     n = B->cmap->n;
2867:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2868:     break;
2869:   case MATPRODUCT_AtB:
2870:     m = A->cmap->n;
2871:     n = B->cmap->n;
2872:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2873:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2874:     break;
2875:   case MATPRODUCT_ABt:
2876:     m = A->rmap->n;
2877:     n = B->rmap->n;
2878:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2879:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2880:     break;
2881:   case MATPRODUCT_PtAP:
2882:     m = B->cmap->n;
2883:     n = B->cmap->n;
2884:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2885:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2886:     break;
2887:   case MATPRODUCT_RARt:
2888:     m = B->rmap->n;
2889:     n = B->rmap->n;
2890:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2891:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2892:     break;
2893:   default:
2894:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2895:   }
2896:   PetscCall(MatSetSizes(C, m, n, m, n));
2897:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2898:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2899:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

2901:   /* product data */
2902:   PetscCall(PetscNew(&mmdata));
2903:   mmdata->cisdense = cisdense;
2904: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2905:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2906:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2907: #endif
2908:   /* for these products we need intermediate storage: X holds A*P (for PtAP) or A*R^T (for RARt) */
2909:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2910:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2911:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2912:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2913:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2914:     } else {
2915:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2916:     }
2917:   }
2918:   C->product->data    = mmdata;
2919:   C->product->destroy = MatDestroy_MatMatCusparse;

2921:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2922:   PetscFunctionReturn(PETSC_SUCCESS);
2923: }

2925: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2926: {
2927:   Mat_Product                  *product = C->product;
2928:   Mat                           A, B;
2929:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2930:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2931:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2932:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2933:   PetscBool                     flg;
2934:   cusparseStatus_t              stat;
2935:   MatProductType                ptype;
2936:   MatMatCusparse               *mmdata;
2937: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2938:   cusparseSpMatDescr_t BmatSpDescr;
2939: #endif
2940:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2942:   PetscFunctionBegin;
2943:   MatCheckProduct(C, 1);
2944:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2945:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2946:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2947:   mmdata = (MatMatCusparse *)C->product->data;
2948:   A      = product->A;
2949:   B      = product->B;
2950:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2951:     mmdata->reusesym = PETSC_FALSE;
2952:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2953:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2954:     Cmat = Ccusp->mat;
2955:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2956:     Ccsr = (CsrMatrix *)Cmat->mat;
2957:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2958:     goto finalize;
2959:   }
2960:   if (!c->nz) goto finalize;
2961:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2962:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2963:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2964:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2965:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2966:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2967:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2968:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2969:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2970:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2971:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2973:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2974:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

2976:   ptype = product->type;
2977:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2978:     ptype = MATPRODUCT_AB;
2979:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2980:   }
2981:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2982:     ptype = MATPRODUCT_AB;
2983:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2984:   }
2985:   switch (ptype) {
2986:   case MATPRODUCT_AB:
2987:     Amat = Acusp->mat;
2988:     Bmat = Bcusp->mat;
2989:     break;
2990:   case MATPRODUCT_AtB:
2991:     Amat = Acusp->matTranspose;
2992:     Bmat = Bcusp->mat;
2993:     break;
2994:   case MATPRODUCT_ABt:
2995:     Amat = Acusp->mat;
2996:     Bmat = Bcusp->matTranspose;
2997:     break;
2998:   default:
2999:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3000:   }
3001:   Cmat = Ccusp->mat;
3002:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3003:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3004:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3005:   Acsr = (CsrMatrix *)Amat->mat;
3006:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3007:   Ccsr = (CsrMatrix *)Cmat->mat;
3008:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3009:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3010:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3011:   PetscCall(PetscLogGpuTimeBegin());
3012: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3013:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3014:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3015:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3016:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3017:   PetscCallCUSPARSE(stat);
3018:   #else
3019:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3020:   PetscCallCUSPARSE(stat);
3021:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3022:   PetscCallCUSPARSE(stat);
3023:   #endif
3024: #else
3025:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3026:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3027:   PetscCallCUSPARSE(stat);
3028: #endif
3029:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3030:   PetscCallCUDA(WaitForCUDA());
3031:   PetscCall(PetscLogGpuTimeEnd());
3032:   C->offloadmask = PETSC_OFFLOAD_GPU;
3033: finalize:
3034:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3035:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3036:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3037:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3038:   c->reallocs = 0;
3039:   C->info.mallocs += 0;
3040:   C->info.nz_unneeded = 0;
3041:   C->assembled = C->was_assembled = PETSC_TRUE;
3042:   C->num_ass++;
3043:   PetscFunctionReturn(PETSC_SUCCESS);
3044: }

3046: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3047: {
3048:   Mat_Product                  *product = C->product;
3049:   Mat                           A, B;
3050:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3051:   Mat_SeqAIJ                   *a, *b, *c;
3052:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3053:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3054:   PetscInt                      i, j, m, n, k;
3055:   PetscBool                     flg;
3056:   cusparseStatus_t              stat;
3057:   MatProductType                ptype;
3058:   MatMatCusparse               *mmdata;
3059:   PetscLogDouble                flops;
3060:   PetscBool                     biscompressed, ciscompressed;
3061: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3062:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3063:   cusparseSpMatDescr_t BmatSpDescr;
3064: #else
3065:   int cnz;
3066: #endif
3067:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

3069:   PetscFunctionBegin;
3070:   MatCheckProduct(C, 1);
3071:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3072:   A = product->A;
3073:   B = product->B;
3074:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3075:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3076:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3077:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3078:   a = (Mat_SeqAIJ *)A->data;
3079:   b = (Mat_SeqAIJ *)B->data;
3080:   /* product data */
3081:   PetscCall(PetscNew(&mmdata));
3082:   C->product->data    = mmdata;
3083:   C->product->destroy = MatDestroy_MatMatCusparse;

3085:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3086:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3087:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3088:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3089:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3090:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

3092:   ptype = product->type;
3093:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3094:     ptype                                          = MATPRODUCT_AB;
3095:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3096:   }
3097:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3098:     ptype                                          = MATPRODUCT_AB;
3099:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3100:   }
3101:   biscompressed = PETSC_FALSE;
3102:   ciscompressed = PETSC_FALSE;
3103:   switch (ptype) {
3104:   case MATPRODUCT_AB:
3105:     m    = A->rmap->n;
3106:     n    = B->cmap->n;
3107:     k    = A->cmap->n;
3108:     Amat = Acusp->mat;
3109:     Bmat = Bcusp->mat;
3110:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3111:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3112:     break;
3113:   case MATPRODUCT_AtB:
3114:     m = A->cmap->n;
3115:     n = B->cmap->n;
3116:     k = A->rmap->n;
3117:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3118:     Amat = Acusp->matTranspose;
3119:     Bmat = Bcusp->mat;
3120:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3121:     break;
3122:   case MATPRODUCT_ABt:
3123:     m = A->rmap->n;
3124:     n = B->rmap->n;
3125:     k = A->cmap->n;
3126:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3127:     Amat = Acusp->mat;
3128:     Bmat = Bcusp->matTranspose;
3129:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3130:     break;
3131:   default:
3132:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3133:   }

3135:   /* create cusparse matrix */
3136:   PetscCall(MatSetSizes(C, m, n, m, n));
3137:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3138:   c     = (Mat_SeqAIJ *)C->data;
3139:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3140:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3141:   Ccsr  = new CsrMatrix;

3143:   c->compressedrow.use = ciscompressed;
3144:   if (c->compressedrow.use) { /* if A is in compressed row format, then C will be in compressed row format as well */
3145:     c->compressedrow.nrows = a->compressedrow.nrows;
3146:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3147:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3148:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3149:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3150:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3151:   } else {
3152:     c->compressedrow.nrows  = 0;
3153:     c->compressedrow.i      = NULL;
3154:     c->compressedrow.rindex = NULL;
3155:     Ccusp->workVector       = NULL;
3156:     Cmat->cprowIndices      = NULL;
3157:   }
3158:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3159:   Ccusp->mat        = Cmat;
3160:   Ccusp->mat->mat   = Ccsr;
3161:   Ccsr->num_rows    = Ccusp->nrows;
3162:   Ccsr->num_cols    = n;
3163:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3164:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3165:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3166:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3167:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3168:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3169:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3170:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3171:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3174:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3175:     c->nz                = 0;
3176:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3177:     Ccsr->values         = new THRUSTARRAY(c->nz);
3178:     goto finalizesym;
3179:   }

3181:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3182:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3183:   Acsr = (CsrMatrix *)Amat->mat;
3184:   if (!biscompressed) {
3185:     Bcsr = (CsrMatrix *)Bmat->mat;
3186: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3187:     BmatSpDescr = Bmat->matDescr;
3188: #endif
3189:   } else { /* we need to use row offsets for the full matrix */
3190:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3191:     Bcsr                 = new CsrMatrix;
3192:     Bcsr->num_rows       = B->rmap->n;
3193:     Bcsr->num_cols       = cBcsr->num_cols;
3194:     Bcsr->num_entries    = cBcsr->num_entries;
3195:     Bcsr->column_indices = cBcsr->column_indices;
3196:     Bcsr->values         = cBcsr->values;
3197:     if (!Bcusp->rowoffsets_gpu) {
3198:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3199:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3200:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3201:     }
3202:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3203:     mmdata->Bcsr      = Bcsr;
3204: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3205:     if (Bcsr->num_rows && Bcsr->num_cols) {
3206:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3207:       PetscCallCUSPARSE(stat);
3208:     }
3209:     BmatSpDescr = mmdata->matSpBDescr;
3210: #endif
3211:   }
3212:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3213:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3214:   /* precompute flops count */
3215:   if (ptype == MATPRODUCT_AB) {
3216:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3217:       const PetscInt st = a->i[i];
3218:       const PetscInt en = a->i[i + 1];
3219:       for (j = st; j < en; j++) {
3220:         const PetscInt brow = a->j[j];
3221:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3222:       }
3223:     }
3224:   } else if (ptype == MATPRODUCT_AtB) {
3225:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3226:       const PetscInt anzi = a->i[i + 1] - a->i[i];
3227:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3228:       flops += (2. * anzi) * bnzi;
3229:     }
3230:   } else { /* TODO */
3231:     flops = 0.;
3232:   }
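  /* Worked example of the count above (illustrative only, not executed): for C = A*B, a row of A with two nonzeros
     whose column indices point at rows of B holding 4 and 2 nonzeros contributes 2*4 + 2*2 = 12 flops, i.e. one
     multiply and one add per scalar product formed */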

3234:   mmdata->flops = flops;
3235:   PetscCall(PetscLogGpuTimeBegin());

3237: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3238:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3239:   // cuda-12.2 requires non-null csrRowOffsets
3240:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3241:   PetscCallCUSPARSE(stat);
3242:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3243:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3244:   {
3245:     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3246:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3247:   */
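    /* The reuse workflow implemented below is: (1) workEstimation to inspect A and B and size the first buffer,
       (2) nnz to compute the sparsity pattern (and nonzero count) of C, (3) allocate C's column indices and values
       and register them with cusparseCsrSetPointers, (4) copy to finalize the pattern, and (5) compute to fill the
       numerical values; dBuffer4 and dBuffer5 are kept in mmdata because the compute step is called again later */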
3248:     void *dBuffer1 = NULL;
3249:     void *dBuffer2 = NULL;
3250:     void *dBuffer3 = NULL;
3251:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3252:     size_t bufferSize1 = 0;
3253:     size_t bufferSize2 = 0;
3254:     size_t bufferSize3 = 0;
3255:     size_t bufferSize4 = 0;
3256:     size_t bufferSize5 = 0;

3258:     /* ask bufferSize1 bytes for external memory */
3259:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3260:     PetscCallCUSPARSE(stat);
3261:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3262:     /* inspect the matrices A and B to understand the memory requirement for the next step */
3263:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3264:     PetscCallCUSPARSE(stat);

3266:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3267:     PetscCallCUSPARSE(stat);
3268:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3269:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3270:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3271:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3272:     PetscCallCUSPARSE(stat);
3273:     PetscCallCUDA(cudaFree(dBuffer1));
3274:     PetscCallCUDA(cudaFree(dBuffer2));

3276:     /* get matrix C non-zero entries C_nnz1 */
3277:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3278:     c->nz = (PetscInt)C_nnz1;
3279:     /* allocate matrix C */
3280:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3281:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3282:     Ccsr->values = new THRUSTARRAY(c->nz);
3283:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3284:     /* update matC with the new pointers */
3285:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3286:     PetscCallCUSPARSE(stat);

3288:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3289:     PetscCallCUSPARSE(stat);
3290:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3291:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3292:     PetscCallCUSPARSE(stat);
3293:     PetscCallCUDA(cudaFree(dBuffer3));
3294:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3295:     PetscCallCUSPARSE(stat);
3296:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3297:   }
3298:   #else
3299:   size_t bufSize2;
3300:   /* ask bufferSize bytes for external memory */
3301:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3302:   PetscCallCUSPARSE(stat);
3303:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3304:   /* inspect the matrices A and B to understand the memory requirement for the next step */
3305:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3306:   PetscCallCUSPARSE(stat);
3307:   /* query the buffer size needed by cusparseSpGEMM_compute */
3308:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3309:   PetscCallCUSPARSE(stat);
3310:   /* Neither the CUSPARSE documentation nor the API is clear here:
3311:      we need both buffers to perform the operations properly.
3312:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3313:      it only appears in the workEstimation step, yet it seems to be needed in compute, so the address
3314:      is probably stored in the descriptor! What a messy API... */
3315:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3316:   /* compute the intermediate product of A * B */
3317:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3318:   PetscCallCUSPARSE(stat);
3319:   /* get matrix C non-zero entries C_nnz1 */
3320:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3321:   c->nz = (PetscInt)C_nnz1;
3322:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3323:                       mmdata->mmBufferSize / 1024));
3324:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3325:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3326:   Ccsr->values = new THRUSTARRAY(c->nz);
3327:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3328:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3329:   PetscCallCUSPARSE(stat);
3330:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3331:   PetscCallCUSPARSE(stat);
3332:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3333: #else
3334:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3335:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3336:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3337:   PetscCallCUSPARSE(stat);
3338:   c->nz                = cnz;
3339:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3340:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3341:   Ccsr->values = new THRUSTARRAY(c->nz);
3342:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

3344:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3345:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3346:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase alone by passing NULL for the values, but it seems quite buggy when
3347:      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3348:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3349:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3350:   PetscCallCUSPARSE(stat);
3351: #endif
3352:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3353:   PetscCall(PetscLogGpuTimeEnd());
3354: finalizesym:
3355:   c->free_a = PETSC_TRUE;
3356:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3357:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3358:   c->free_ij = PETSC_TRUE;
3359:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3360:     PetscInt      *d_i = c->i;
3361:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3362:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3363:     ii = *Ccsr->row_offsets;
3364:     jj = *Ccsr->column_indices;
3365:     if (ciscompressed) d_i = c->compressedrow.i;
3366:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368:   } else {
3369:     PetscInt *d_i = c->i;
3370:     if (ciscompressed) d_i = c->compressedrow.i;
3371:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373:   }
3374:   if (ciscompressed) { /* expand the host row offsets to full length: c->compressedrow.i only stores offsets for the nonempty rows listed in rindex, so replicate each offset across the skipped empty rows */
3375:     PetscInt r = 0;
3376:     c->i[0]    = 0;
3377:     for (k = 0; k < c->compressedrow.nrows; k++) {
3378:       const PetscInt next = c->compressedrow.rindex[k];
3379:       const PetscInt old  = c->compressedrow.i[k];
3380:       for (; r < next; r++) c->i[r + 1] = old;
3381:     }
3382:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3383:   }
3384:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3385:   PetscCall(PetscMalloc1(m, &c->ilen));
3386:   PetscCall(PetscMalloc1(m, &c->imax));
3387:   c->maxnz         = c->nz;
3388:   c->nonzerorowcnt = 0;
3389:   c->rmax          = 0;
3390:   for (k = 0; k < m; k++) {
3391:     const PetscInt nn = c->i[k + 1] - c->i[k];
3392:     c->ilen[k] = c->imax[k] = nn;
3393:     c->nonzerorowcnt += (PetscInt)!!nn;
3394:     c->rmax = PetscMax(c->rmax, nn);
3395:   }
3396:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3397:   PetscCall(PetscMalloc1(c->nz, &c->a));
3398:   Ccsr->num_entries = c->nz;

3400:   C->nonzerostate++;
3401:   PetscCall(PetscLayoutSetUp(C->rmap));
3402:   PetscCall(PetscLayoutSetUp(C->cmap));
3403:   Ccusp->nonzerostate = C->nonzerostate;
3404:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3405:   C->preallocated     = PETSC_TRUE;
3406:   C->assembled        = PETSC_FALSE;
3407:   C->was_assembled    = PETSC_FALSE;
3408:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3409:     mmdata->reusesym = PETSC_TRUE;
3410:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3411:   }
3412:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3413:   PetscFunctionReturn(PETSC_SUCCESS);
3414: }

3416: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3418: /* handles sparse or dense B */
3419: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3420: {
3421:   Mat_Product *product = mat->product;
3422:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3424:   PetscFunctionBegin;
3425:   MatCheckProduct(mat, 1);
3426:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3427:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3428:   if (product->type == MATPRODUCT_ABC) {
3429:     Ciscusp = PETSC_FALSE;
3430:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3431:   }
3432:   if (Biscusp && Ciscusp) { /* even when B and C are CUSPARSE, the user can still select the CPU backend via runtime options */
3433:     PetscBool usecpu = PETSC_FALSE;
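    /* For example, running with -matmatmult_backend_cpu (MatMatMult API) or -mat_product_algorithm_backend_cpu
       (MatProductCreate API) makes an AB product fall back to the CPU kernels dispatched further below */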
3434:     switch (product->type) {
3435:     case MATPRODUCT_AB:
3436:       if (product->api_user) {
3437:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3438:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3439:         PetscOptionsEnd();
3440:       } else {
3441:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3442:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3443:         PetscOptionsEnd();
3444:       }
3445:       break;
3446:     case MATPRODUCT_AtB:
3447:       if (product->api_user) {
3448:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3449:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3450:         PetscOptionsEnd();
3451:       } else {
3452:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3453:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3454:         PetscOptionsEnd();
3455:       }
3456:       break;
3457:     case MATPRODUCT_PtAP:
3458:       if (product->api_user) {
3459:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3460:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3461:         PetscOptionsEnd();
3462:       } else {
3463:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3464:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3465:         PetscOptionsEnd();
3466:       }
3467:       break;
3468:     case MATPRODUCT_RARt:
3469:       if (product->api_user) {
3470:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3471:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3472:         PetscOptionsEnd();
3473:       } else {
3474:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3475:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3476:         PetscOptionsEnd();
3477:       }
3478:       break;
3479:     case MATPRODUCT_ABC:
3480:       if (product->api_user) {
3481:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3482:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3483:         PetscOptionsEnd();
3484:       } else {
3485:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3486:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3487:         PetscOptionsEnd();
3488:       }
3489:       break;
3490:     default:
3491:       break;
3492:     }
3493:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3494:   }
3495:   /* dispatch */
3496:   if (isdense) {
3497:     switch (product->type) {
3498:     case MATPRODUCT_AB:
3499:     case MATPRODUCT_AtB:
3500:     case MATPRODUCT_ABt:
3501:     case MATPRODUCT_PtAP:
3502:     case MATPRODUCT_RARt:
3503:       if (product->A->boundtocpu) {
3504:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3505:       } else {
3506:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3507:       }
3508:       break;
3509:     case MATPRODUCT_ABC:
3510:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3511:       break;
3512:     default:
3513:       break;
3514:     }
3515:   } else if (Biscusp && Ciscusp) {
3516:     switch (product->type) {
3517:     case MATPRODUCT_AB:
3518:     case MATPRODUCT_AtB:
3519:     case MATPRODUCT_ABt:
3520:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3521:       break;
3522:     case MATPRODUCT_PtAP:
3523:     case MATPRODUCT_RARt:
3524:     case MATPRODUCT_ABC:
3525:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3526:       break;
3527:     default:
3528:       break;
3529:     }
3530:   } else { /* fallback for AIJ */
3531:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3532:   }
3533:   PetscFunctionReturn(PETSC_SUCCESS);
3534: }

3536: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3537: {
3538:   PetscFunctionBegin;
3539:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3540:   PetscFunctionReturn(PETSC_SUCCESS);
3541: }

3543: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3544: {
3545:   PetscFunctionBegin;
3546:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3547:   PetscFunctionReturn(PETSC_SUCCESS);
3548: }

3550: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3551: {
3552:   PetscFunctionBegin;
3553:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3554:   PetscFunctionReturn(PETSC_SUCCESS);
3555: }

3557: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3558: {
3559:   PetscFunctionBegin;
3560:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3561:   PetscFunctionReturn(PETSC_SUCCESS);
3562: }

3564: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3565: {
3566:   PetscFunctionBegin;
3567:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3568:   PetscFunctionReturn(PETSC_SUCCESS);
3569: }

3571: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3572: {
3573:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3574:   if (i < n) y[idx[i]] += x[i];
3575: }

3577: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3578: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3579: {
3580:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3581:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3582:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3583:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3584:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3585:   PetscBool                     compressed;
3586: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3587:   PetscInt nx, ny;
3588: #endif

3590:   PetscFunctionBegin;
3591:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3592:   if (!a->nz) {
3593:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3594:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3595:     PetscFunctionReturn(PETSC_SUCCESS);
3596:   }
3597:   /* The line below is necessary because operations that modify the matrix on the CPU (axpy, scale, etc.) can leave the GPU copy out of date */
3598:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3599:   if (!trans) {
3600:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3601:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3602:   } else {
3603:     if (herm || !A->form_explicit_transpose) {
3604:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3605:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3606:     } else {
3607:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3608:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3609:     }
3610:   }
3611:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3612:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3614:   try {
3615:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3616:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3617:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3619:     PetscCall(PetscLogGpuTimeBegin());
3620:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3621:       /* z = A x + beta y.
3622:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3623:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3624:       */
3625:       xptr = xarray;
3626:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3627:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3628: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3629:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3630:           allocated to accommodate different uses. So we get the length info directly from mat.
3631:        */
3632:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3633:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3634:         nx             = mat->num_cols; // since y = Ax
3635:         ny             = mat->num_rows;
3636:       }
3637: #endif
3638:     } else {
3639:       /* z = A^T x + beta y
3640:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3641:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3642:        */
3643:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3644:       dptr = zarray;
3645:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3646:       if (compressed) { /* Scatter x to work vector */
3647:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

3649:         thrust::for_each(
3650: #if PetscDefined(HAVE_THRUST_ASYNC)
3651:           thrust::cuda::par.on(PetscDefaultCudaStream),
3652: #endif
3653:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3654:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3655:       }
3656: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3657:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3658:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3659:         nx             = mat->num_rows; // since y = A^T x
3660:         ny             = mat->num_cols;
3661:       }
3662: #endif
3663:     }

3665:     /* csr_spmv does y = alpha op(A) x + beta y */
3666:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3667: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3668:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3669:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3670:   #else
3671:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3672:   #endif

3674:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3675:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3676:       if (!matDescr) {
3677:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3678:         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3679:       }
3680:   #endif

3682:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3683:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3684:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3685:         PetscCallCUSPARSE(
3686:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3687:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3688:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3689:         PetscCallCUSPARSE(
3690:           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3691:   #endif
3692:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3693:       } else {
3694:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3695:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3696:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3697:       }

3699:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3700: #else
3701:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3702:       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3703: #endif
3704:     } else {
3705:       if (cusparsestruct->nrows) {
3706: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3707:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3708: #else
3709:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3710:         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3711: #endif
3712:       }
3713:     }
3714:     PetscCall(PetscLogGpuTimeEnd());

3716:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3717:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3718:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3719:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3720:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3721:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3722:         }
3723:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3724:         PetscCall(VecSeq_CUDA::Set(zz, 0));
3725:       }

3727:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3728:       if (compressed) {
3729:         PetscCall(PetscLogGpuTimeBegin());
3730:         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3731:         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3732:         PetscCall(PetscLogGpuTimeEnd());
3733:       }
3734:     } else {
3735:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3736:     }
3737:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3738:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3739:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3740:   } catch (char *ex) {
3741:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3742:   }
3743:   if (yy) {
3744:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3745:   } else {
3746:     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3747:   }
3748:   PetscFunctionReturn(PETSC_SUCCESS);
3749: }

3751: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3752: {
3753:   PetscFunctionBegin;
3754:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3755:   PetscFunctionReturn(PETSC_SUCCESS);
3756: }

3758: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3759: {
3760:   PetscFunctionBegin;
3761:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3762:   PetscFunctionReturn(PETSC_SUCCESS);
3763: }

3765: /*@
3766:   MatCreateSeqAIJCUSPARSE - Creates a sequential sparse matrix in `MATSEQAIJCUSPARSE` (compressed row) format,
3767:   the CUDA/cuSPARSE-accelerated version of `MATSEQAIJ`.

3769:   Collective

3771:   Input Parameters:
3772: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3773: . m    - number of rows
3774: . n    - number of columns
3775: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3776: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3778:   Output Parameter:
3779: . A - the matrix

3781:   Level: intermediate

3783:   Notes:
3784:   This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
3785:   calculations. For good matrix assembly performance the user should preallocate the matrix
3786:   storage by setting the parameter `nz` (or the array `nnz`).

3788:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3789:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3790:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3792:   The AIJ format, also called
3793:   compressed row storage, is fully compatible with standard Fortran
3794:   storage.  That is, the stored row and column indices can begin at
3795:   either one (as in Fortran) or zero.

3797:   Specify the preallocated storage with either `nz` or `nnz` (not both).
3798:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3799:   allocation.
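
  Example Usage:
  A minimal illustrative sketch (the sizes and `nnz` values are made up for the example) is
.vb
  Mat      A;
  PetscInt nnz[3] = {3, 2, 3};

  PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 3, 3, 0, nnz, &A));
  /* ... fill with MatSetValues() and assemble with MatAssemblyBegin()/MatAssemblyEnd() ... */
  PetscCall(MatDestroy(&A));
.ve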

3801: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3802: @*/
3803: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3804: {
3805:   PetscFunctionBegin;
3806:   PetscCall(MatCreate(comm, A));
3807:   PetscCall(MatSetSizes(*A, m, n, m, n));
3808:   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3809:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3810:   PetscFunctionReturn(PETSC_SUCCESS);
3811: }

3813: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3814: {
3815:   PetscFunctionBegin;
3816:   if (A->factortype == MAT_FACTOR_NONE) {
3817:     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3818:   } else {
3819:     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3820:   }
3821:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3822:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3823:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3824:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3825:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3826:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3827:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3828:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3829:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3830:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3831:   PetscCall(MatDestroy_SeqAIJ(A));
3832:   PetscFunctionReturn(PETSC_SUCCESS);
3833: }

3835: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3836: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3837: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3838: {
3839:   PetscFunctionBegin;
3840:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3841:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3842:   PetscFunctionReturn(PETSC_SUCCESS);
3843: }

3845: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3846: {
3847:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3848:   Mat_SeqAIJCUSPARSE *cy;
3849:   Mat_SeqAIJCUSPARSE *cx;
3850:   PetscScalar        *ay;
3851:   const PetscScalar  *ax;
3852:   CsrMatrix          *csry, *csrx;

3854:   PetscFunctionBegin;
3855:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3856:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3857:   if (X->ops->axpy != Y->ops->axpy) {
3858:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3859:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3860:     PetscFunctionReturn(PETSC_SUCCESS);
3861:   }
3862:   /* if we are here, it means both matrices are bound to GPU */
3863:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3864:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3865:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3866:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3867:   csry = (CsrMatrix *)cy->mat->mat;
3868:   csrx = (CsrMatrix *)cx->mat->mat;
3869:   /* see if we can turn this into a cublas axpy */
3870:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3871:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3872:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3873:     if (eq) str = SAME_NONZERO_PATTERN;
3874:   }
3875:   /* spgeam is buggy with one column */
3876:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3878:   if (str == SUBSET_NONZERO_PATTERN) {
3879:     PetscScalar b = 1.0;
3880: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3881:     size_t bufferSize;
3882:     void  *buffer;
3883: #endif

3885:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3886:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3887:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3888: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3889:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3890:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3891:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3892:     PetscCall(PetscLogGpuTimeBegin());
3893:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3894:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3895:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3896:     PetscCall(PetscLogGpuTimeEnd());
3897:     PetscCallCUDA(cudaFree(buffer));
3898: #else
3899:     PetscCall(PetscLogGpuTimeBegin());
3900:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3901:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3902:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3903:     PetscCall(PetscLogGpuTimeEnd());
3904: #endif
3905:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3906:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3907:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3908:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3909:   } else if (str == SAME_NONZERO_PATTERN) {
3910:     cublasHandle_t cublasv2handle;
3911:     PetscBLASInt   one = 1, bnz = 1;

3913:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3914:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3915:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3916:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3917:     PetscCall(PetscLogGpuTimeBegin());
3918:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3919:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3920:     PetscCall(PetscLogGpuTimeEnd());
3921:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3922:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3923:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3924:   } else {
3925:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3926:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3927:   }
3928:   PetscFunctionReturn(PETSC_SUCCESS);
3929: }
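
/*
   Illustrative sketch (assumes X and Y are assembled MATSEQAIJCUSPARSE matrices): with
   identical nonzero patterns the routine above reduces the update to a single cuBLAS axpy
   on the value arrays.

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN)); // Y <- 2*X + Y on the GPU

   With DIFFERENT_NONZERO_PATTERN the call falls back to the host MatAXPY_SeqAIJ() path.
*/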

3931: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3932: {
3933:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3934:   PetscScalar   *ay;
3935:   cublasHandle_t cublasv2handle;
3936:   PetscBLASInt   one = 1, bnz = 1;

3938:   PetscFunctionBegin;
3939:   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3940:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3941:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3942:   PetscCall(PetscLogGpuTimeBegin());
3943:   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3944:   PetscCall(PetscLogGpuFlops(bnz));
3945:   PetscCall(PetscLogGpuTimeEnd());
3946:   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3947:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3948:   PetscFunctionReturn(PETSC_SUCCESS);
3949: }

3951: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3952: {
3953:   PetscBool   both = PETSC_FALSE;
3954:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3956:   PetscFunctionBegin;
3957:   if (A->factortype == MAT_FACTOR_NONE) {
3958:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3959:     if (spptr->mat) {
3960:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3961:       if (matrix->values) {
3962:         both = PETSC_TRUE;
3963:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3964:       }
3965:     }
3966:     if (spptr->matTranspose) {
3967:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3968:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3969:     }
3970:   }
3971:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3972:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3973:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3974:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3975:   PetscFunctionReturn(PETSC_SUCCESS);
3976: }

3978: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
3979: {
3980:   PetscFunctionBegin;
3981:   *m = PETSC_MEMTYPE_CUDA;
3982:   PetscFunctionReturn(PETSC_SUCCESS);
3983: }

3985: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3986: {
3987:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3989:   PetscFunctionBegin;
3990:   if (A->factortype != MAT_FACTOR_NONE) {
3991:     A->boundtocpu = flg;
3992:     PetscFunctionReturn(PETSC_SUCCESS);
3993:   }
3994:   if (flg) {
3995:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

3997:     A->ops->scale                     = MatScale_SeqAIJ;
3998:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3999:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4000:     A->ops->mult                      = MatMult_SeqAIJ;
4001:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4002:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4003:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4004:     A->ops->multhermitiantranspose    = NULL;
4005:     A->ops->multhermitiantransposeadd = NULL;
4006:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4007:     A->ops->getcurrentmemtype         = NULL;
4008:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4009:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4010:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4011:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4012:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4013:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4014:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4015:   } else {
4016:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4017:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4018:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4019:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4020:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4021:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4022:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4023:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4024:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4025:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4026:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
4027:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4028:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4029:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4030:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4031:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4032:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4033:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

4035:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4036:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4037:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4038:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4039:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4040:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4041:   }
4042:   A->boundtocpu = flg;
4043:   if (flg && a->inode.size_csr) {
4044:     a->inode.use = PETSC_TRUE;
4045:   } else {
4046:     a->inode.use = PETSC_FALSE;
4047:   }
4048:   PetscFunctionReturn(PETSC_SUCCESS);
4049: }
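
/*
   Illustrative sketch: the public MatBindToCPU() interface toggles between the host and
   GPU operation tables set up above.

     PetscCall(MatBindToCPU(A, PETSC_TRUE));  // subsequent MatMult() etc. run on the host
     PetscCall(MatMult(A, x, y));
     PetscCall(MatBindToCPU(A, PETSC_FALSE)); // restore the CUSPARSE implementations
*/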

4051: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4052: {
4053:   Mat B;

4055:   PetscFunctionBegin;
4056:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4057:   if (reuse == MAT_INITIAL_MATRIX) {
4058:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4059:   } else if (reuse == MAT_REUSE_MATRIX) {
4060:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4061:   }
4062:   B = *newmat;

4064:   PetscCall(PetscFree(B->defaultvectype));
4065:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

4067:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4068:     if (B->factortype == MAT_FACTOR_NONE) {
4069:       Mat_SeqAIJCUSPARSE *spptr;
4070:       PetscCall(PetscNew(&spptr));
4071:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4072:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4073:       spptr->format = MAT_CUSPARSE_CSR;
4074: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4075:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4076:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4077:   #else
4078:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4079:   #endif
4080:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4081:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4082: #endif
4083:       B->spptr = spptr;
4084:     } else {
4085:       Mat_SeqAIJCUSPARSETriFactors *spptr;

4087:       PetscCall(PetscNew(&spptr));
4088:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4089:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4090:       B->spptr = spptr;
4091:     }
4092:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4093:   }
4094:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
4095:   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
4096:   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
4097:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
4098:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
4099:   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
4100:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

4102:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4103:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4104:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4105: #if defined(PETSC_HAVE_HYPRE)
4106:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4107: #endif
4108:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4109:   PetscFunctionReturn(PETSC_SUCCESS);
4110: }
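
/*
   Illustrative sketch: an existing MATSEQAIJ matrix can be promoted to MATSEQAIJCUSPARSE
   through the public MatConvert() interface, which dispatches to the routine above.

     PetscCall(MatConvert(A, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); // convert in place
     // or keep A and create a separate GPU copy B
     PetscCall(MatConvert(A, MATSEQAIJCUSPARSE, MAT_INITIAL_MATRIX, &B));
*/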

4112: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4113: {
4114:   PetscFunctionBegin;
4115:   PetscCall(MatCreate_SeqAIJ(B));
4116:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4117:   PetscFunctionReturn(PETSC_SUCCESS);
4118: }

4120: /*MC
4121:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

4123:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4124:    CSR, ELL, or Hybrid format; with CUDA 11.0 and later only CSR is supported.
4125:    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

4127:    Options Database Keys:
4128: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4129: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4130:                                       Other options include ell (ellpack) or hyb (hybrid).
4131: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4132: -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

4134:   Level: beginner

4136: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4137: M*/
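
/*
   Illustrative sketch of the options above on the command line, assuming the application
   calls MatSetFromOptions() on its matrices:

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr -mat_cusparse_use_cpu_solve
*/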

4139: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4140: {
4141:   PetscFunctionBegin;
4142:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4143:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4144:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4145:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4146:   PetscFunctionReturn(PETSC_SUCCESS);
4147: }
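
/*
   Illustrative sketch: once registered, the CUSPARSE factorizations can be selected for a
   factorization-based preconditioner (pc is a hypothetical PCLU/PCILU object) either in
   code or from the options database.

     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
     // equivalently: -pc_factor_mat_solver_type cusparse
*/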

4149: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4150: {
4151:   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

4153:   PetscFunctionBegin;
4154:   if (cusp) {
4155:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4156:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4157:     delete cusp->workVector;
4158:     delete cusp->rowoffsets_gpu;
4159:     delete cusp->csr2csc_i;
4160:     delete cusp->coords;
4161:     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4162:     PetscCall(PetscFree(mat->spptr));
4163:   }
4164:   PetscFunctionReturn(PETSC_SUCCESS);
4165: }

4167: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4168: {
4169:   PetscFunctionBegin;
4170:   if (*mat) {
4171:     delete (*mat)->values;
4172:     delete (*mat)->column_indices;
4173:     delete (*mat)->row_offsets;
4174:     delete *mat;
4175:     *mat = 0;
4176:   }
4177:   PetscFunctionReturn(PETSC_SUCCESS);
4178: }

4180: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4181: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4182: {
4183:   PetscFunctionBegin;
4184:   if (*trifactor) {
4185:     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4186:     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4187:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4188:     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4189:     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4190:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4191:     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4192:   #endif
4193:     PetscCall(PetscFree(*trifactor));
4194:   }
4195:   PetscFunctionReturn(PETSC_SUCCESS);
4196: }
4197: #endif

4199: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4200: {
4201:   CsrMatrix *mat;

4203:   PetscFunctionBegin;
4204:   if (*matstruct) {
4205:     if ((*matstruct)->mat) {
4206:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4207: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4208:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4209: #else
4210:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4211:         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4212: #endif
4213:       } else {
4214:         mat = (CsrMatrix *)(*matstruct)->mat;
4215:         PetscCall(CsrMatrix_Destroy(&mat));
4216:       }
4217:     }
4218:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4219:     delete (*matstruct)->cprowIndices;
4220:     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4221:     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4222:     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

4224: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4225:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4226:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

4228:     for (int i = 0; i < 3; i++) {
4229:       if (mdata->cuSpMV[i].initialized) {
4230:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4231:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4232:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4233:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4234:         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4235:         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4236:   #endif
4237:       }
4238:     }
4239: #endif
4240:     delete *matstruct;
4241:     *matstruct = NULL;
4242:   }
4243:   PetscFunctionReturn(PETSC_SUCCESS);
4244: }

4246: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4247: {
4248:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

4250:   PetscFunctionBegin;
4251:   if (fs) {
4252: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4253:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4254:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4255:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4256:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4257:     delete fs->workVector;
4258:     fs->workVector = NULL;
4259: #endif
4260:     delete fs->rpermIndices;
4261:     delete fs->cpermIndices;
4262:     fs->rpermIndices  = NULL;
4263:     fs->cpermIndices  = NULL;
4264:     fs->init_dev_prop = PETSC_FALSE;
4265: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4266:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4267:     PetscCallCUDA(cudaFree(fs->csrColIdx));
4268:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4269:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4270:     PetscCallCUDA(cudaFree(fs->csrVal));
4271:     PetscCallCUDA(cudaFree(fs->diag));
4272:     PetscCallCUDA(cudaFree(fs->X));
4273:     PetscCallCUDA(cudaFree(fs->Y));
4274:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares its memory with one of spsvBuffer_L/U */
4275:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4276:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4277:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4278:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4279:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4280:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4281:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4282:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4283:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4284:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4285:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4286:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4287:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4288:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4289:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4290:     PetscCall(PetscFree(fs->csrRowPtr_h));
4291:     PetscCall(PetscFree(fs->csrVal_h));
4292:     PetscCall(PetscFree(fs->diag_h));
4293:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4294:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4295: #endif
4296:   }
4297:   PetscFunctionReturn(PETSC_SUCCESS);
4298: }

4300: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4301: {
4302:   PetscFunctionBegin;
4303:   if (*trifactors) {
4304:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4305:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4306:     PetscCall(PetscFree(*trifactors));
4307:   }
4308:   PetscFunctionReturn(PETSC_SUCCESS);
4309: }

4311: struct IJCompare {
4312:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4313:   {
4314:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4315:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4316:     return false;
4317:   }
4318: };

4320: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4321: {
4322:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4324:   PetscFunctionBegin;
4325:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4326:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4327:   if (destroy) {
4328:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4329:     delete cusp->csr2csc_i;
4330:     cusp->csr2csc_i = NULL;
4331:   }
4332:   A->transupdated = PETSC_FALSE;
4333:   PetscFunctionReturn(PETSC_SUCCESS);
4334: }

4336: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4337: {
4338:   MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

4340:   PetscFunctionBegin;
4341:   PetscCallCUDA(cudaFree(coo->perm));
4342:   PetscCallCUDA(cudaFree(coo->jmap));
4343:   PetscCall(PetscFree(coo));
4344:   PetscFunctionReturn(PETSC_SUCCESS);
4345: }

4347: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4348: {
4349:   PetscBool            dev_ij = PETSC_FALSE;
4350:   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
4351:   PetscInt            *i, *j;
4352:   PetscContainer       container_h;
4353:   MatCOOStruct_SeqAIJ *coo_h, *coo_d;

4355:   PetscFunctionBegin;
4356:   PetscCall(PetscGetMemType(coo_i, &mtype));
4357:   if (PetscMemTypeDevice(mtype)) {
4358:     dev_ij = PETSC_TRUE;
4359:     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4360:     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4361:     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4362:   } else {
4363:     i = coo_i;
4364:     j = coo_j;
4365:   }

4367:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4368:   if (dev_ij) PetscCall(PetscFree2(i, j));
4369:   mat->offloadmask = PETSC_OFFLOAD_CPU;
4370:   // Create the GPU memory
4371:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

4373:   // Copy the COO struct to device
4374:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4375:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4376:   PetscCall(PetscMalloc1(1, &coo_d));
4377:   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4378:   PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4379:   PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4380:   PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4381:   PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

4383:   // Put the COO struct in a container and then attach that to the matrix
4384:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4385:   PetscFunctionReturn(PETSC_SUCCESS);
4386: }

4388: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4389: {
4390:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4391:   const PetscCount grid_size = gridDim.x * blockDim.x;
4392:   for (; i < nnz; i += grid_size) {
4393:     PetscScalar sum = 0.0;
4394:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4395:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4396:   }
4397: }

4399: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4400: {
4401:   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
4402:   Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4403:   PetscCount           Annz = seq->nz;
4404:   PetscMemType         memtype;
4405:   const PetscScalar   *v1 = v;
4406:   PetscScalar         *Aa;
4407:   PetscContainer       container;
4408:   MatCOOStruct_SeqAIJ *coo;

4410:   PetscFunctionBegin;
4411:   if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

4413:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4414:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

4416:   PetscCall(PetscGetMemType(v, &memtype));
4417:   if (PetscMemTypeHost(memtype)) { /* if the user provided v[] in host memory, copy it to the device */
4418:     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4419:     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4420:   }

4422:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4423:   else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

4425:   PetscCall(PetscLogGpuTimeBegin());
4426:   if (Annz) {
4427:     MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4428:     PetscCallCUDA(cudaPeekAtLastError());
4429:   }
4430:   PetscCall(PetscLogGpuTimeEnd());

4432:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4433:   else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

4435:   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4436:   PetscFunctionReturn(PETSC_SUCCESS);
4437: }
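
/*
   Illustrative sketch of COO assembly on this matrix type; n, coo_i, coo_j and vals are
   hypothetical user arrays, and vals may reside in host or device memory (both cases are
   handled above).

     PetscCall(MatSetPreallocationCOO(A, n, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, vals, ADD_VALUES));
*/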

4439: /*@C
4440:   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

4442:   Not Collective

4444:   Input Parameters:
4445: + A          - the matrix
4446: - compressed - `PETSC_TRUE` or `PETSC_FALSE`, indicating whether the matrix data structure should always be returned in compressed form

4448:   Output Parameters:
4449: + i - the CSR row pointers
4450: - j - the CSR column indices

4452:   Level: developer

4454:   Note:
4455:   When `compressed` is `PETSC_TRUE`, the CSR structure does not contain empty rows

4457: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4458: @*/
4459: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4460: {
4461:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4462:   CsrMatrix          *csr;
4463:   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

4465:   PetscFunctionBegin;
4467:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4468:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4469:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4470:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4471:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4472:   csr = (CsrMatrix *)cusp->mat->mat;
4473:   if (i) {
4474:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4475:       if (!cusp->rowoffsets_gpu) {
4476:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4477:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4478:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4479:       }
4480:       *i = cusp->rowoffsets_gpu->data().get();
4481:     } else *i = csr->row_offsets->data().get();
4482:   }
4483:   if (j) *j = csr->column_indices->data().get();
4484:   PetscFunctionReturn(PETSC_SUCCESS);
4485: }
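
/*
   Illustrative sketch: the returned i/j pointers reference GPU memory and remain valid only
   until the matching restore call.

     const int *i, *j;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j));
     // ... launch a kernel that reads the CSR row pointers i[] and column indices j[] ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/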

4487: /*@C
4488:   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

4490:   Not Collective

4492:   Input Parameters:
4493: + A          - the matrix
4494: . compressed - `PETSC_TRUE` or `PETSC_FALSE`, indicating whether the matrix data structure should always be returned in compressed form
4495: . i          - the CSR row pointers
4496: - j          - the CSR column indices

4498:   Level: developer

4500: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4501: @*/
4502: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4503: {
4504:   PetscFunctionBegin;
4506:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4507:   if (i) *i = NULL;
4508:   if (j) *j = NULL;
4509:   (void)compressed;
4510:   PetscFunctionReturn(PETSC_SUCCESS);
4511: }

4513: /*@C
4514:   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4516:   Not Collective

4518:   Input Parameter:
4519: . A - a `MATSEQAIJCUSPARSE` matrix

4521:   Output Parameter:
4522: . a - pointer to the device data

4524:   Level: developer

4526:   Note:
4527:   May trigger host-device copies if up-to-date matrix data is on host

4529: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4530: @*/
4531: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4532: {
4533:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4534:   CsrMatrix          *csr;

4536:   PetscFunctionBegin;
4538:   PetscAssertPointer(a, 2);
4539:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4540:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4541:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4542:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4543:   csr = (CsrMatrix *)cusp->mat->mat;
4544:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4545:   *a = csr->values->data().get();
4546:   PetscFunctionReturn(PETSC_SUCCESS);
4547: }

4549: /*@C
4550:   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

4552:   Not Collective

4554:   Input Parameters:
4555: + A - a `MATSEQAIJCUSPARSE` matrix
4556: - a - pointer to the device data

4558:   Level: developer

4560: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4561: @*/
4562: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4563: {
4564:   PetscFunctionBegin;
4566:   PetscAssertPointer(a, 2);
4567:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4568:   *a = NULL;
4569:   PetscFunctionReturn(PETSC_SUCCESS);
4570: }
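
/*
   Illustrative sketch: read-only access to the device value array, e.g. to feed a custom
   kernel; the pointer must be restored before the matrix is modified.

     const PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
     // ... kernel reads a[] on the GPU ...
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
*/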

4572: /*@C
4573:   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4575:   Not Collective

4577:   Input Parameter:
4578: . A - a `MATSEQAIJCUSPARSE` matrix

4580:   Output Parameter:
4581: . a - pointer to the device data

4583:   Level: developer

4585:   Note:
4586:   May trigger host-device copies if up-to-date matrix data is on host

4588: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4589: @*/
4590: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4591: {
4592:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4593:   CsrMatrix          *csr;

4595:   PetscFunctionBegin;
4597:   PetscAssertPointer(a, 2);
4598:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4600:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4601:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4602:   csr = (CsrMatrix *)cusp->mat->mat;
4603:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4604:   *a             = csr->values->data().get();
4605:   A->offloadmask = PETSC_OFFLOAD_GPU;
4606:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4607:   PetscFunctionReturn(PETSC_SUCCESS);
4608: }
4609: /*@C
4610:   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

4612:   Not Collective

4614:   Input Parameters:
4615: + A - a `MATSEQAIJCUSPARSE` matrix
4616: - a - pointer to the device data

4618:   Level: developer

4620: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4621: @*/
4622: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4623: {
4624:   PetscFunctionBegin;
4626:   PetscAssertPointer(a, 2);
4627:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4628:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4629:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4630:   *a = NULL;
4631:   PetscFunctionReturn(PETSC_SUCCESS);
4632: }

4634: /*@C
4635:   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4637:   Not Collective

4639:   Input Parameter:
4640: . A - a `MATSEQAIJCUSPARSE` matrix

4642:   Output Parameter:
4643: . a - pointer to the device data

4645:   Level: developer

4647:   Note:
4648:   Does not trigger host-device copies and flags data validity on the GPU

4650: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4651: @*/
4652: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4653: {
4654:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4655:   CsrMatrix          *csr;

4657:   PetscFunctionBegin;
4659:   PetscAssertPointer(a, 2);
4660:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4661:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4662:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4663:   csr = (CsrMatrix *)cusp->mat->mat;
4664:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4665:   *a             = csr->values->data().get();
4666:   A->offloadmask = PETSC_OFFLOAD_GPU;
4667:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4668:   PetscFunctionReturn(PETSC_SUCCESS);
4669: }

4671: /*@C
4672:   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

4674:   Not Collective

4676:   Input Parameters:
4677: + A - a `MATSEQAIJCUSPARSE` matrix
4678: - a - pointer to the device data

4680:   Level: developer

4682: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4683: @*/
4684: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4685: {
4686:   PetscFunctionBegin;
4688:   PetscAssertPointer(a, 2);
4689:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4690:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4691:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4692:   *a = NULL;
4693:   PetscFunctionReturn(PETSC_SUCCESS);
4694: }
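
/*
   Illustrative sketch: write-only access is appropriate when all nonzero values are
   recomputed directly on the GPU, since no host-to-device copy of stale values is triggered.

     PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &a));
     // ... kernel fills a[] with new values ...
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &a));
*/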

4696: struct IJCompare4 {
4697:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4698:   {
4699:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4700:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4701:     return false;
4702:   }
4703: };

4705: struct Shift {
4706:   int _shift;

4708:   Shift(int shift) : _shift(shift) { }
4709:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4710: };

4712: /* merges two SeqAIJCUSPARSE matrices A, B side by side, i.e. row i of the result is row i of A followed by row i of B ([A';B']' operation in MATLAB notation) */
4713: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4714: {
4715:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4716:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4717:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4718:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4719:   PetscInt                      Annz, Bnnz;
4720:   cusparseStatus_t              stat;
4721:   PetscInt                      i, m, n, zero = 0;

4723:   PetscFunctionBegin;
4726:   PetscAssertPointer(C, 4);
4727:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4728:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4729:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4730:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4731:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4732:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4733:   if (reuse == MAT_INITIAL_MATRIX) {
4734:     m = A->rmap->n;
4735:     n = A->cmap->n + B->cmap->n;
4736:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4737:     PetscCall(MatSetSizes(*C, m, n, m, n));
4738:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4739:     c                       = (Mat_SeqAIJ *)(*C)->data;
4740:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4741:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4742:     Ccsr                    = new CsrMatrix;
4743:     Cmat->cprowIndices      = NULL;
4744:     c->compressedrow.use    = PETSC_FALSE;
4745:     c->compressedrow.nrows  = 0;
4746:     c->compressedrow.i      = NULL;
4747:     c->compressedrow.rindex = NULL;
4748:     Ccusp->workVector       = NULL;
4749:     Ccusp->nrows            = m;
4750:     Ccusp->mat              = Cmat;
4751:     Ccusp->mat->mat         = Ccsr;
4752:     Ccsr->num_rows          = m;
4753:     Ccsr->num_cols          = n;
4754:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4755:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4756:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4757:     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4758:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4759:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4760:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4761:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4762:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4763:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4764:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4765:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4766:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

4768:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4769:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4770:     Annz                 = (PetscInt)Acsr->column_indices->size();
4771:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4772:     c->nz                = Annz + Bnnz;
4773:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4774:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4775:     Ccsr->values         = new THRUSTARRAY(c->nz);
4776:     Ccsr->num_entries    = c->nz;
4777:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4778:     if (c->nz) {
4779:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4780:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4781:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4782:       THRUSTINTARRAY32 *Aroff, *Broff;

4784:       if (a->compressedrow.use) { /* need full row offset */
4785:         if (!Acusp->rowoffsets_gpu) {
4786:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4787:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4788:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4789:         }
4790:         Aroff = Acusp->rowoffsets_gpu;
4791:       } else Aroff = Acsr->row_offsets;
4792:       if (b->compressedrow.use) { /* need full row offset */
4793:         if (!Bcusp->rowoffsets_gpu) {
4794:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4795:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4796:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4797:         }
4798:         Broff = Bcusp->rowoffsets_gpu;
4799:       } else Broff = Bcsr->row_offsets;
4800:       PetscCall(PetscLogGpuTimeBegin());
4801:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4802:       PetscCallCUSPARSE(stat);
4803:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4804:       PetscCallCUSPARSE(stat);
4805:       /* there are issues when using bool with large matrices on SUMMIT (CUDA 10.2.89) */
4806:       auto Aperm = thrust::make_constant_iterator(1);
4807:       auto Bperm = thrust::make_constant_iterator(0);
4808: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4809:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4810:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4811: #else
4812:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4813:       auto Bcib = Bcsr->column_indices->begin();
4814:       auto Bcie = Bcsr->column_indices->end();
4815:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4816: #endif
4817:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4818:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4819:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4820:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4821:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4822:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4823:       auto p1    = Ccusp->coords->begin();
4824:       auto p2    = Ccusp->coords->begin();
4825:       thrust::advance(p2, Annz);
4826:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4827: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4828:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4829: #endif
4830:       auto cci = thrust::make_counting_iterator(zero);
4831:       auto cce = thrust::make_counting_iterator(c->nz);
4832: #if 0 //Errors on SUMMIT cuda 11.1.0
4833:       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4834: #else
4835:   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4836:       auto pred = thrust::identity<int>();
4837:   #else
4838:       auto pred = cuda::std::identity();
4839:   #endif
4840:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4841:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4842: #endif
4843:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4844:       PetscCallCUSPARSE(stat);
4845:       PetscCall(PetscLogGpuTimeEnd());
4846:       delete wPerm;
4847:       delete Acoo;
4848:       delete Bcoo;
4849:       delete Ccoo;
4850: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4851:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4852:       PetscCallCUSPARSE(stat);
4853: #endif
4854:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have their explicit transposes formed, generate the transpose of C too */
4855:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4856:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4857:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4858:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4859:         CsrMatrix                    *CcsrT = new CsrMatrix;
4860:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4861:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4863:         (*C)->form_explicit_transpose = PETSC_TRUE;
4864:         (*C)->transupdated            = PETSC_TRUE;
4865:         Ccusp->rowoffsets_gpu         = NULL;
4866:         CmatT->cprowIndices           = NULL;
4867:         CmatT->mat                    = CcsrT;
4868:         CcsrT->num_rows               = n;
4869:         CcsrT->num_cols               = m;
4870:         CcsrT->num_entries            = c->nz;

4872:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4873:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4874:         CcsrT->values         = new THRUSTARRAY(c->nz);

4876:         PetscCall(PetscLogGpuTimeBegin());
4877:         auto rT = CcsrT->row_offsets->begin();
4878:         if (AT) {
4879:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4880:           thrust::advance(rT, -1);
4881:         }
4882:         if (BT) {
4883:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4884:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4885:           thrust::copy(titb, tite, rT);
4886:         }
4887:         auto cT = CcsrT->column_indices->begin();
4888:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4889:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4890:         auto vT = CcsrT->values->begin();
4891:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4892:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4893:         PetscCall(PetscLogGpuTimeEnd());

4895:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4896:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4897:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4898:         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4899:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4900:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4901:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4902:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4903:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4904: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4905:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4906:         PetscCallCUSPARSE(stat);
4907: #endif
4908:         Ccusp->matTranspose = CmatT;
4909:       }
4910:     }

4912:     c->free_a = PETSC_TRUE;
4913:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4914:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4915:     c->free_ij = PETSC_TRUE;
4916:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4917:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4918:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4919:       ii = *Ccsr->row_offsets;
4920:       jj = *Ccsr->column_indices;
4921:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4922:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4923:     } else {
4924:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4925:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4926:     }
4927:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4928:     PetscCall(PetscMalloc1(m, &c->ilen));
4929:     PetscCall(PetscMalloc1(m, &c->imax));
4930:     c->maxnz         = c->nz;
4931:     c->nonzerorowcnt = 0;
4932:     c->rmax          = 0;
4933:     for (i = 0; i < m; i++) {
4934:       const PetscInt nn = c->i[i + 1] - c->i[i];
4935:       c->ilen[i] = c->imax[i] = nn;
4936:       c->nonzerorowcnt += (PetscInt)!!nn;
4937:       c->rmax = PetscMax(c->rmax, nn);
4938:     }
4939:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4940:     PetscCall(PetscMalloc1(c->nz, &c->a));
4941:     (*C)->nonzerostate++;
4942:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4943:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4944:     Ccusp->nonzerostate = (*C)->nonzerostate;
4945:     (*C)->preallocated  = PETSC_TRUE;
4946:   } else {
4947:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4948:     c = (Mat_SeqAIJ *)(*C)->data;
4949:     if (c->nz) {
4950:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4951:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4952:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4953:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4954:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4955:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4956:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4957:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4958:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4959:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4960:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4961:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4962:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4963:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4964:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4965:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4966:       auto pmid = Ccusp->coords->begin();
4967:       thrust::advance(pmid, Acsr->num_entries);
4968:       PetscCall(PetscLogGpuTimeBegin());
4969:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4970:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4971:       thrust::for_each(zibait, zieait, VecCUDAEquals());
4972:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4973:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4974:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4975:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4976:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4977:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4978:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4979:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4980:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4981:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4982:         auto       vT    = CcsrT->values->begin();
4983:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4984:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4985:         (*C)->transupdated = PETSC_TRUE;
4986:       }
4987:       PetscCall(PetscLogGpuTimeEnd());
4988:     }
4989:   }
4990:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4991:   (*C)->assembled     = PETSC_TRUE;
4992:   (*C)->was_assembled = PETSC_FALSE;
4993:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4994:   PetscFunctionReturn(PETSC_SUCCESS);
4995: }
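
/*
   Illustrative sketch: merging two MATSEQAIJCUSPARSE matrices with the same number of rows,
   so that row i of C is row i of A followed by row i of B.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // later, with unchanged sparsity patterns, only refresh the values
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
*/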

4997: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4998: {
4999:   bool               dmem;
5000:   const PetscScalar *av;

5002:   PetscFunctionBegin;
5003:   dmem = isCudaMem(v);
5004:   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5005:   if (n && idx) {
5006:     THRUSTINTARRAY widx(n);
5007:     widx.assign(idx, idx + n);
5008:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

5010:     THRUSTARRAY                    *w = NULL;
5011:     thrust::device_ptr<PetscScalar> dv;
5012:     if (dmem) {
5013:       dv = thrust::device_pointer_cast(v);
5014:     } else {
5015:       w  = new THRUSTARRAY(n);
5016:       dv = w->data();
5017:     }
5018:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

5020:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5021:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5022:     thrust::for_each(zibit, zieit, VecCUDAEquals());
5023:     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5024:     delete w;
5025:   } else {
5026:     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5027:   }
5028:   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
5029:   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5030:   PetscFunctionReturn(PETSC_SUCCESS);
5031: }