Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18: #endif
 19: #include <thrust/iterator/constant_iterator.h>
 20: #include <thrust/remove.h>
 21: #include <thrust/sort.h>
 22: #include <thrust/unique.h>
 23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
 24:   #include <cuda/std/functional>
 25: #endif

 27: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
 28: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 29: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 30: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
 31:     ascending 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

 33:   typedef enum {
 34:       CUSPARSE_MV_ALG_DEFAULT = 0,
 35:       CUSPARSE_COOMV_ALG      = 1,
 36:       CUSPARSE_CSRMV_ALG1     = 2,
 37:       CUSPARSE_CSRMV_ALG2     = 3
 38:   } cusparseSpMVAlg_t;

 40:   typedef enum {
 41:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 42:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 43:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 44:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 45:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 46:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 47:       CUSPARSE_SPMM_COO_ALG1    = 1,
 48:       CUSPARSE_SPMM_COO_ALG2    = 2,
 49:       CUSPARSE_SPMM_COO_ALG3    = 3,
 50:       CUSPARSE_SPMM_COO_ALG4    = 5,
 51:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 52:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 53:   } cusparseSpMMAlg_t;

 55:   typedef enum {
 56:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 57:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 58:   } cusparseCsr2CscAlg_t;
 59:   */
 60: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 61: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 62: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 63: #endif

 65: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 66: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 67: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 68: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 69: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
 70: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 71: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 72: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 74: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 75: #endif
 76: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 77: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 78: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 79: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 80: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 81: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 82: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 83: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 84: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 85: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 87: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 88: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 89: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 90: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 92: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 93: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 95: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 96: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 97: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 99: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
100: {
101:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

103:   PetscFunctionBegin;
104:   switch (op) {
105:   case MAT_CUSPARSE_MULT:
106:     cusparsestruct->format = format;
107:     break;
108:   case MAT_CUSPARSE_ALL:
109:     cusparsestruct->format = format;
110:     break;
111:   default:
112:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
113:   }
114:   PetscFunctionReturn(PETSC_SUCCESS);
115: }

117: /*@
118:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
119:   operation. Only the `MatMult()` operation can use different GPU storage formats.

121:   Not Collective

123:   Input Parameters:
124: + A      - Matrix of type `MATSEQAIJCUSPARSE`
125: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
126:         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
127: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

129:   Level: intermediate

131: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
132: @*/
133: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
134: {
135:   PetscFunctionBegin;
136:   PetscValidHeaderSpecificType(A, MAT_CLASSID, 1, MATSEQAIJCUSPARSE);
137:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
138:   PetscFunctionReturn(PETSC_SUCCESS);
139: }
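
/* A minimal usage sketch (illustrative only, not part of this file; the helper name is made up and
   <petscmat.h> is assumed). It switches the SpMV storage of a sequential AIJ matrix to ELL; the
   equivalent runtime option is -mat_cusparse_mult_storage_format ell. */
static PetscErrorCode ExampleUseEllForSpMV(Mat A)
{
  PetscFunctionBeginUser;
  PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));                             /* the matrix must be of type MATSEQAIJCUSPARSE */
  PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL)); /* ELL format is used for MatMult() only */
  PetscFunctionReturn(PETSC_SUCCESS);
}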

141: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
142: {
143:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

145:   PetscFunctionBegin;
146:   cusparsestruct->use_cpu_solve = use_cpu;
147:   PetscFunctionReturn(PETSC_SUCCESS);
148: }

150: /*@
151:   MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

153:   Input Parameters:
154: + A       - Matrix of type `MATSEQAIJCUSPARSE`
155: - use_cpu - set flag for using the built-in CPU `MatSolve()`

157:   Level: intermediate

159:   Note:
160:   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
161:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and doing the solve there.
162:   This method specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

164: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
165: @*/
166: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
167: {
168:   PetscFunctionBegin;
169:   PetscValidHeaderSpecificType(A, MAT_CLASSID, 1, MATSEQAIJCUSPARSE);
170:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
171:   PetscFunctionReturn(PETSC_SUCCESS);
172: }
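
/* A minimal usage sketch (illustrative only; the helper name is made up, <petscmat.h> assumed).
   It keeps the triangular solves of an (I)LU factorization on the CPU, matching the runtime
   option -mat_cusparse_use_cpu_solve. */
static PetscErrorCode ExampleKeepSolveOnCPU(Mat A)
{
  PetscFunctionBeginUser;
  PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE)); /* the (I)LU triangular solves are done on the CPU */
  PetscFunctionReturn(PETSC_SUCCESS);
}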

174: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
175: {
176:   PetscFunctionBegin;
177:   switch (op) {
178:   case MAT_FORM_EXPLICIT_TRANSPOSE:
179:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
180:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
181:     A->form_explicit_transpose = flg;
182:     break;
183:   default:
184:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
185:     break;
186:   }
187:   PetscFunctionReturn(PETSC_SUCCESS);
188: }

190: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
191: {
192:   MatCUSPARSEStorageFormat format;
193:   PetscBool                flg;
194:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

196:   PetscFunctionBegin;
197:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
198:   if (A->factortype == MAT_FACTOR_NONE) {
199:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

202:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
203:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
204:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
205:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
206: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
207:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
208:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
209:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
210:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211:   #else
212:     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
213:   #endif
214:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
215:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

217:     PetscCall(
218:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
219:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
220: #endif
221:   }
222:   PetscOptionsHeadEnd();
223:   PetscFunctionReturn(PETSC_SUCCESS);
224: }
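
/* Summary of the runtime options parsed above (a command-line sketch; the enum value strings come
   from the arrays defined at the top of this file):
     -mat_cusparse_mult_storage_format csr|ell|hyb   storage format used for SpMV
     -mat_cusparse_storage_format      csr|ell|hyb   storage format used for SpMV and TriSolve
     -mat_cusparse_use_cpu_solve                     do the (I)LU triangular solves on the CPU
     -mat_cusparse_spmv_alg    <alg>                 cuSPARSE SpMV algorithm          (CUDA >= 11)
     -mat_cusparse_spmm_alg    <alg>                 cuSPARSE SpMM algorithm          (CUDA >= 11)
     -mat_cusparse_csr2csc_alg <alg>                 cuSPARSE CSR-to-CSC algorithm    (CUDA >= 11)
*/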

226: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
227: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
228: {
229:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
230:   PetscInt                      m  = A->rmap->n;
231:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
232:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
233:   const MatScalar              *Aa = a->a;
234:   PetscInt                     *Mi, *Mj, Mnz;
235:   PetscScalar                  *Ma;

237:   PetscFunctionBegin;
238:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
239:     if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
240:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
241:       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
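      // Host layout of the CPU-computed factors, as used by the loop below: row i of L (strictly lower
      // triangular, unit diagonal not stored) sits at Aj/Aa[Ai[i]..Ai[i+1]-1], while row i of U is stored
      // bottom-up, with its inverted diagonal at index Adiag[i] and its off-diagonal entries at
      // Adiag[i+1]+1..Adiag[i]-1. They are merged here into one ordinary CSR matrix M holding L and U.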
242:       PetscCall(PetscMalloc1(m + 1, &Mi));
243:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
244:       PetscCall(PetscMalloc1(Mnz, &Ma));
245:       Mi[0] = 0;
246:       for (PetscInt i = 0; i < m; i++) {
247:         PetscInt llen = Ai[i + 1] - Ai[i];
248:         PetscInt ulen = Adiag[i] - Adiag[i + 1];
249:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
250:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
251:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
252:         Mi[i + 1] = Mi[i] + llen + ulen;
253:       }
254:       // Copy M (L,U) from host to device
255:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
256:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
257:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
258:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
259:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

261:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
262:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
263:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
264:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
265:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
266:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
267:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
268:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

270:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
271:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
272:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

274:       fillMode = CUSPARSE_FILL_MODE_UPPER;
275:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
276:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
277:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
278:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

280:       // Allocate work vectors in SpSv
281:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
282:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

284:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
285:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

287:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
288:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
289:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
290:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
291:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
292:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
293:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

295:       // Record for reuse
296:       fs->csrRowPtr_h = Mi;
297:       fs->csrVal_h    = Ma;
298:       PetscCall(PetscFree(Mj));
299:     }
300:     // Copy the values
301:     Mi  = fs->csrRowPtr_h;
302:     Ma  = fs->csrVal_h;
303:     Mnz = Mi[m];
304:     for (PetscInt i = 0; i < m; i++) {
305:       PetscInt llen = Ai[i + 1] - Ai[i];
306:       PetscInt ulen = Adiag[i] - Adiag[i + 1];
307:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
308:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
309:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
310:     }
311:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

313:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
314:     if (fs->updatedSpSVAnalysis) { // cusparseSpSV_analysis() has already been done and only the matrix values changed
315:       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
316:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
317:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
318:     } else
319:   #endif
320:     {
321:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
322:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

324:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
325:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
326:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
327:     }
328:   }
329:   PetscFunctionReturn(PETSC_SUCCESS);
330: }
331: #else
332: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
333: {
334:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
335:   PetscInt                           n                  = A->rmap->n;
336:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
337:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
338:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
339:   const MatScalar                   *aa = a->a, *v;
340:   PetscInt                          *AiLo, *AjLo;
341:   PetscInt                           i, nz, nzLower, offset, rowOffset;

343:   PetscFunctionBegin;
344:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
345:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
346:     try {
347:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
348:       nzLower = n + ai[n] - ai[1];
349:       if (!loTriFactor) {
350:         PetscScalar *AALo;

352:         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

354:         /* Allocate Space for the lower triangular matrix */
355:         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
356:         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

358:         /* Fill the lower triangular matrix */
359:         AiLo[0]   = (PetscInt)0;
360:         AiLo[n]   = nzLower;
361:         AjLo[0]   = (PetscInt)0;
362:         AALo[0]   = (MatScalar)1.0;
363:         v         = aa;
364:         vi        = aj;
365:         offset    = 1;
366:         rowOffset = 1;
367:         for (i = 1; i < n; i++) {
368:           nz = ai[i + 1] - ai[i];
369:           /* additional 1 for the term on the diagonal */
370:           AiLo[i] = rowOffset;
371:           rowOffset += nz + 1;

373:           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
374:           PetscCall(PetscArraycpy(&AALo[offset], v, nz));

376:           offset += nz;
377:           AjLo[offset] = (PetscInt)i;
378:           AALo[offset] = (MatScalar)1.0;
379:           offset += 1;

381:           v += nz;
382:           vi += nz;
383:         }

385:         /* allocate space for the triangular factor information */
386:         PetscCall(PetscNew(&loTriFactor));
387:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
388:         /* Create the matrix description */
389:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
390:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
391:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
392:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
393:   #else
394:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
395:   #endif
396:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
397:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

399:         /* set the operation */
400:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

402:         /* set the matrix */
403:         loTriFactor->csrMat              = new CsrMatrix;
404:         loTriFactor->csrMat->num_rows    = n;
405:         loTriFactor->csrMat->num_cols    = n;
406:         loTriFactor->csrMat->num_entries = nzLower;

408:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
409:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

411:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
412:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

414:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
415:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

417:         /* Create the solve analysis information */
418:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
419:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
420:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
421:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
422:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
423:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
424:   #endif

426:         /* perform the solve analysis */
427:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
428:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
429:         PetscCallCUDA(WaitForCUDA());
430:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

432:         /* assign the pointer */
433:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
434:         loTriFactor->AA_h                                          = AALo;
435:         PetscCallCUDA(cudaFreeHost(AiLo));
436:         PetscCallCUDA(cudaFreeHost(AjLo));
437:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
438:       } else { /* update values only */
439:         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
440:         /* Fill the lower triangular matrix */
441:         loTriFactor->AA_h[0] = 1.0;
442:         v                    = aa;
443:         vi                   = aj;
444:         offset               = 1;
445:         for (i = 1; i < n; i++) {
446:           nz = ai[i + 1] - ai[i];
447:           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
448:           offset += nz;
449:           loTriFactor->AA_h[offset] = 1.0;
450:           offset += 1;
451:           v += nz;
452:         }
453:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
454:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
455:       }
456:     } catch (char *ex) {
457:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
458:     }
459:   }
460:   PetscFunctionReturn(PETSC_SUCCESS);
461: }

463: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
464: {
465:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
466:   PetscInt                           n                  = A->rmap->n;
467:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
468:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
469:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
470:   const MatScalar                   *aa = a->a, *v;
471:   PetscInt                          *AiUp, *AjUp;
472:   PetscInt                           i, nz, nzUpper, offset;

474:   PetscFunctionBegin;
475:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
476:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
477:     try {
478:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
479:       nzUpper = adiag[0] - adiag[n];
480:       if (!upTriFactor) {
481:         PetscScalar *AAUp;

483:         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

485:         /* Allocate Space for the upper triangular matrix */
486:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
487:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

489:         /* Fill the upper triangular matrix */
490:         AiUp[0] = (PetscInt)0;
491:         AiUp[n] = nzUpper;
492:         offset  = nzUpper;
493:         for (i = n - 1; i >= 0; i--) {
494:           v  = aa + adiag[i + 1] + 1;
495:           vi = aj + adiag[i + 1] + 1;

497:           /* number of elements NOT on the diagonal */
498:           nz = adiag[i] - adiag[i + 1] - 1;

500:           /* decrement the offset */
501:           offset -= (nz + 1);

503:           /* first, set the diagonal elements */
504:           AjUp[offset] = (PetscInt)i;
505:           AAUp[offset] = (MatScalar)1. / v[nz];
506:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

508:           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
509:           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
510:         }

512:         /* allocate space for the triangular factor information */
513:         PetscCall(PetscNew(&upTriFactor));
514:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

516:         /* Create the matrix description */
517:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
518:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
519:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
520:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
521:   #else
522:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
523:   #endif
524:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
525:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

527:         /* set the operation */
528:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

530:         /* set the matrix */
531:         upTriFactor->csrMat              = new CsrMatrix;
532:         upTriFactor->csrMat->num_rows    = n;
533:         upTriFactor->csrMat->num_cols    = n;
534:         upTriFactor->csrMat->num_entries = nzUpper;

536:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
537:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

539:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
540:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

542:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
543:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

545:         /* Create the solve analysis information */
546:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
547:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
548:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
549:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
550:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
551:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
552:   #endif

554:         /* perform the solve analysis */
555:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
556:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

558:         PetscCallCUDA(WaitForCUDA());
559:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

561:         /* assign the pointer */
562:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
563:         upTriFactor->AA_h                                          = AAUp;
564:         PetscCallCUDA(cudaFreeHost(AiUp));
565:         PetscCallCUDA(cudaFreeHost(AjUp));
566:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
567:       } else {
568:         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
569:         /* Fill the upper triangular matrix */
570:         offset = nzUpper;
571:         for (i = n - 1; i >= 0; i--) {
572:           v = aa + adiag[i + 1] + 1;

574:           /* number of elements NOT on the diagonal */
575:           nz = adiag[i] - adiag[i + 1] - 1;

577:           /* decrement the offset */
578:           offset -= (nz + 1);

580:           /* first, set the diagonal elements */
581:           upTriFactor->AA_h[offset] = 1. / v[nz];
582:           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
583:         }
584:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
585:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
586:       }
587:     } catch (char *ex) {
588:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
589:     }
590:   }
591:   PetscFunctionReturn(PETSC_SUCCESS);
592: }
593: #endif

595: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
596: {
597:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
598:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
599:   IS                            isrow = a->row, isicol = a->icol;
600:   PetscBool                     row_identity, col_identity;
601:   PetscInt                      n = A->rmap->n;

603:   PetscFunctionBegin;
604:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
605: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
606:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
607: #else
608:   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
609:   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
610:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
611: #endif

613:   cusparseTriFactors->nnz = a->nz;

615:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
616:   /* row permutation indices */
617:   PetscCall(ISIdentity(isrow, &row_identity));
618:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
619:     const PetscInt *r;

621:     PetscCall(ISGetIndices(isrow, &r));
622:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
623:     cusparseTriFactors->rpermIndices->assign(r, r + n);
624:     PetscCall(ISRestoreIndices(isrow, &r));
625:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
626:   }

628:   /* column permutation indices */
629:   PetscCall(ISIdentity(isicol, &col_identity));
630:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
631:     const PetscInt *c;

633:     PetscCall(ISGetIndices(isicol, &c));
634:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
635:     cusparseTriFactors->cpermIndices->assign(c, c + n);
636:     PetscCall(ISRestoreIndices(isicol, &c));
637:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
638:   }
639:   PetscFunctionReturn(PETSC_SUCCESS);
640: }
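
/* Usage sketch (illustrative): the ILU path above is typically reached by factoring a
   MATSEQAIJCUSPARSE matrix with the cusparse solver, e.g. with the runtime options
     -mat_type seqaijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   or programmatically via MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F). */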

642: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
643: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
644: {
645:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
646:   PetscInt                      m  = A->rmap->n;
647:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
648:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
649:   const MatScalar              *Aa = a->a;
650:   PetscInt                     *Mj, Mnz;
651:   PetscScalar                  *Ma, *D;

653:   PetscFunctionBegin;
654:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
655:     if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
656:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
657:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
658:       Mnz = Ai[m]; // Unz (with the unit diagonal)
659:       PetscCall(PetscMalloc1(Mnz, &Ma));
660:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
661:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
662:       for (PetscInt i = 0; i < m; i++) {
663:         PetscInt ulen = Ai[i + 1] - Ai[i];
664:         Mj[Ai[i]]     = i;                                              // diagonal entry
665:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
666:       }
667:       // Copy M (U) from host to device
668:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
669:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
670:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
671:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
672:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
673:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

675:       // Create the descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
676:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
677:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
678:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
679:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
680:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
681:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
682:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

684:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
685:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
686:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

688:       // Allocate work vectors in SpSv
689:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
690:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

692:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
693:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

695:       // Query buffer sizes for SpSV and then allocate buffers
696:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
697:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
698:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

700:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
701:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
702:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

704:       // Record for reuse
705:       fs->csrVal_h = Ma;
706:       fs->diag_h   = D;
707:       PetscCall(PetscFree(Mj));
708:     }
709:     // Copy the values
710:     Ma  = fs->csrVal_h;
711:     D   = fs->diag_h;
712:     Mnz = Ai[m];
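    // As can be read off the loop below: the host factor stores the inverted diagonal at Aa[Adiag[i]]
    // and the negated off-diagonal entries of row i of U at Aa[Ai[i]..Ai[i+1]-2]; the copy flips their
    // sign and writes an explicit (cosmetic) unit diagonal so the device CSR matrix holds U directly.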
713:     for (PetscInt i = 0; i < m; i++) {
714:       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
715:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
716:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
717:     }
718:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
719:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

721:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
722:     if (fs->updatedSpSVAnalysis) {
723:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
725:     } else
726:   #endif
727:     {
728:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
729:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
730:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
731:       fs->updatedSpSVAnalysis = PETSC_TRUE;
732:     }
733:   }
734:   PetscFunctionReturn(PETSC_SUCCESS);
735: }

737: // Solve Ut D U x = b
738: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
739: {
740:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
741:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
742:   const PetscScalar                    *barray;
743:   PetscScalar                          *xarray;
744:   thrust::device_ptr<const PetscScalar> bGPU;
745:   thrust::device_ptr<PetscScalar>       xGPU;
746:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
747:   PetscInt                              m   = A->rmap->n;

749:   PetscFunctionBegin;
750:   PetscCall(PetscLogGpuTimeBegin());
751:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
752:   PetscCall(VecCUDAGetArrayRead(b, &barray));
753:   xGPU = thrust::device_pointer_cast(xarray);
754:   bGPU = thrust::device_pointer_cast(barray);

756:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
757:   if (fs->rpermIndices) {
758:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
759:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
760:   } else {
761:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
762:   }

764:   // Solve Ut Y = X
765:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
766:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

768:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
769:   // It is basically a vector element-wise multiplication, but cublas does not have it!
770:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

772:   // Solve U X = Y
773:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
774:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
775:   } else {
776:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
777:   }
778:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

780:   // Reorder X with the column permutation if needed, and put the result back to x
781:   if (fs->cpermIndices) {
782:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
783:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
784:   }

786:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
787:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
788:   PetscCall(PetscLogGpuTimeEnd());
789:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
790:   PetscFunctionReturn(PETSC_SUCCESS);
791: }
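
/* Usage sketch (illustrative): this Cholesky/ICC solve is typically reached with the cusparse
   solver on a MATSEQAIJCUSPARSE matrix, e.g. with the runtime options
     -mat_type seqaijcusparse -pc_type icc -pc_factor_mat_solver_type cusparse
   or programmatically via MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ICC, &F). */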
792: #else
793: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
794: {
795:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
796:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
797:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
798:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
799:   PetscInt                          *AiUp, *AjUp;
800:   PetscScalar                       *AAUp;
801:   PetscScalar                       *AALo;
802:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
803:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
804:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
805:   const MatScalar                   *aa = b->a, *v;

807:   PetscFunctionBegin;
808:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
809:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
810:     try {
811:       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
812:       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
813:       if (!upTriFactor && !loTriFactor) {
814:         /* Allocate Space for the upper triangular matrix */
815:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
816:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

818:         /* Fill the upper triangular matrix */
819:         AiUp[0] = (PetscInt)0;
820:         AiUp[n] = nzUpper;
821:         offset  = 0;
822:         for (i = 0; i < n; i++) {
823:           /* set the pointers */
824:           v  = aa + ai[i];
825:           vj = aj + ai[i];
826:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

828:           /* first, set the diagonal elements */
829:           AjUp[offset] = (PetscInt)i;
830:           AAUp[offset] = (MatScalar)1.0 / v[nz];
831:           AiUp[i]      = offset;
832:           AALo[offset] = (MatScalar)1.0 / v[nz];

834:           offset += 1;
835:           if (nz > 0) {
836:             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
837:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
838:             for (j = offset; j < offset + nz; j++) {
839:               AAUp[j] = -AAUp[j];
840:               AALo[j] = AAUp[j] / v[nz];
841:             }
842:             offset += nz;
843:           }
844:         }

846:         /* allocate space for the triangular factor information */
847:         PetscCall(PetscNew(&upTriFactor));
848:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

850:         /* Create the matrix description */
851:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
852:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
853:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
854:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
855:   #else
856:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
857:   #endif
858:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
859:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

861:         /* set the matrix */
862:         upTriFactor->csrMat              = new CsrMatrix;
863:         upTriFactor->csrMat->num_rows    = A->rmap->n;
864:         upTriFactor->csrMat->num_cols    = A->cmap->n;
865:         upTriFactor->csrMat->num_entries = a->nz;

867:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
868:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

870:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
871:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

873:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
874:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

876:         /* set the operation */
877:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

879:         /* Create the solve analysis information */
880:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
881:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
882:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
883:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
884:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
885:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
886:   #endif

888:         /* perform the solve analysis */
889:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
890:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

892:         PetscCallCUDA(WaitForCUDA());
893:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

895:         /* assign the pointer */
896:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

898:         /* allocate space for the triangular factor information */
899:         PetscCall(PetscNew(&loTriFactor));
900:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

902:         /* Create the matrix description */
903:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
904:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
905:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
906:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
907:   #else
908:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
909:   #endif
910:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
911:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

913:         /* set the operation */
914:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

916:         /* set the matrix */
917:         loTriFactor->csrMat              = new CsrMatrix;
918:         loTriFactor->csrMat->num_rows    = A->rmap->n;
919:         loTriFactor->csrMat->num_cols    = A->cmap->n;
920:         loTriFactor->csrMat->num_entries = a->nz;

922:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
923:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

925:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
926:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

928:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
929:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

931:         /* Create the solve analysis information */
932:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
933:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
934:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
935:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
936:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
937:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
938:   #endif

940:         /* perform the solve analysis */
941:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
942:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

944:         PetscCallCUDA(WaitForCUDA());
945:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

947:         /* assign the pointer */
948:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

950:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
951:         PetscCallCUDA(cudaFreeHost(AiUp));
952:         PetscCallCUDA(cudaFreeHost(AjUp));
953:       } else {
954:         /* Fill the upper triangular matrix */
955:         offset = 0;
956:         for (i = 0; i < n; i++) {
957:           /* set the pointers */
958:           v  = aa + ai[i];
959:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

961:           /* first, set the diagonal elements */
962:           AAUp[offset] = 1.0 / v[nz];
963:           AALo[offset] = 1.0 / v[nz];

965:           offset += 1;
966:           if (nz > 0) {
967:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
968:             for (j = offset; j < offset + nz; j++) {
969:               AAUp[j] = -AAUp[j];
970:               AALo[j] = AAUp[j] / v[nz];
971:             }
972:             offset += nz;
973:           }
974:         }
975:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
976:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
977:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
978:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
979:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
980:       }
981:       PetscCallCUDA(cudaFreeHost(AAUp));
982:       PetscCallCUDA(cudaFreeHost(AALo));
983:     } catch (char *ex) {
984:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
985:     }
986:   }
987:   PetscFunctionReturn(PETSC_SUCCESS);
988: }
989: #endif

991: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
992: {
993:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
994:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
995:   IS                            ip                 = a->row;
996:   PetscBool                     perm_identity;
997:   PetscInt                      n = A->rmap->n;

999:   PetscFunctionBegin;
1000:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

1002: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1003:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
1004: #else
1005:   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1006:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1007: #endif
1008:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
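  /* a->nz counts the stored upper triangle including the diagonal, so the factor pair holds n diagonal entries plus 2*(a->nz - n) off-diagonal entries */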

1010:   A->offloadmask = PETSC_OFFLOAD_BOTH;

1012:   /* lower triangular indices */
1013:   PetscCall(ISIdentity(ip, &perm_identity));
1014:   if (!perm_identity) {
1015:     IS              iip;
1016:     const PetscInt *irip, *rip;

1018:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1019:     PetscCall(ISGetIndices(iip, &irip));
1020:     PetscCall(ISGetIndices(ip, &rip));
1021:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1022:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1023:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1024:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1025:     PetscCall(ISRestoreIndices(iip, &irip));
1026:     PetscCall(ISDestroy(&iip));
1027:     PetscCall(ISRestoreIndices(ip, &rip));
1028:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1029:   }
1030:   PetscFunctionReturn(PETSC_SUCCESS);
1031: }

1033: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1034: {
1035:   PetscFunctionBegin;
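  /* The numeric factorization itself is done on the host by MatCholeskyFactorNumeric_SeqAIJ(); the resulting factors are copied back to the GPU below so the triangular solves can run there */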
1036:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1037:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1038:   B->offloadmask = PETSC_OFFLOAD_CPU;

1040: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1041:   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1042:   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1043: #else
1044:   /* determine which version of MatSolve needs to be used. */
1045:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1046:   IS          ip = b->row;
1047:   PetscBool   perm_identity;

1049:   PetscCall(ISIdentity(ip, &perm_identity));
1050:   if (perm_identity) {
1051:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1052:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1053:   } else {
1054:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1055:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1056:   }
1057: #endif
1058:   B->ops->matsolve          = NULL;
1059:   B->ops->matsolvetranspose = NULL;

1061:   /* get the triangular factors */
1062:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1063:   PetscFunctionReturn(PETSC_SUCCESS);
1064: }
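
/* Typical usage from application code (a minimal sketch, not part of this file; it assumes A has type MATSEQAIJCUSPARSE
   and b, x are assembled vectors; all names below are standard PETSc API):

     KSP ksp;
     PC  pc;
     PetscCall(KSPCreate(PETSC_COMM_SELF, &ksp));
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCICC));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE)); // route the ICC factorization and solves to the routines in this file
     PetscCall(KSPSolve(ksp, b, x));

   or, equivalently, the command-line options -pc_type icc -pc_factor_mat_solver_type cusparse. */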

1066: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1067: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1068: {
1069:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1070:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1071:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1072:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1073:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1074:   cusparseIndexBase_t                indexBase;
1075:   cusparseMatrixType_t               matrixType;
1076:   cusparseFillMode_t                 fillMode;
1077:   cusparseDiagType_t                 diagType;

1079:   PetscFunctionBegin;
1080:   /* allocate space for the transpose of the lower triangular factor */
1081:   PetscCall(PetscNew(&loTriFactorT));
1082:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1084:   /* set the matrix descriptors of the lower triangular factor */
1085:   matrixType = cusparseGetMatType(loTriFactor->descr);
1086:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1087:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1088:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

1090:   /* Create the matrix description */
1091:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1092:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1093:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1094:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1095:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

1097:   /* set the operation */
1098:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1100:   /* allocate GPU space for the CSC of the lower triangular factor */
1101:   loTriFactorT->csrMat                 = new CsrMatrix;
1102:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1103:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1104:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1105:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1106:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1107:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1109:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1110:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1111:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1112:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1113:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1114:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1115:   #endif

1117:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1118:   {
1119:     // there is no clean way to wrap this call directly in PetscCallCUSPARSE, so capture the status and check it below
1120:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1121:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1122:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1123:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1124:   #else
1125:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1126:   #endif
1127:     PetscCallCUSPARSE(stat);
1128:   }

1130:   PetscCallCUDA(WaitForCUDA());
1131:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1133:   /* Create the solve analysis information */
1134:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1135:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1136:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1137:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1138:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1139:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1140:   #endif

1142:   /* perform the solve analysis */
1143:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1144:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1146:   PetscCallCUDA(WaitForCUDA());
1147:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1149:   /* assign the pointer */
1150:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1152:   /*********************************************/
1153:   /* Now the Transpose of the Upper Tri Factor */
1154:   /*********************************************/

1156:   /* allocate space for the transpose of the upper triangular factor */
1157:   PetscCall(PetscNew(&upTriFactorT));
1158:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1160:   /* set the matrix descriptors of the upper triangular factor */
1161:   matrixType = cusparseGetMatType(upTriFactor->descr);
1162:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1163:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1164:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

1166:   /* Create the matrix description */
1167:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1168:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1169:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1170:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1171:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

1173:   /* set the operation */
1174:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1176:   /* allocate GPU space for the CSC of the upper triangular factor */
1177:   upTriFactorT->csrMat                 = new CsrMatrix;
1178:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1179:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1180:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1181:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1182:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1183:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1185:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1186:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1187:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1188:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1189:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1190:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1191:   #endif

1193:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1194:   {
1195:     // there is no clean way to wrap this call directly in PetscCallCUSPARSE, so capture the status and check it below
1196:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1197:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1198:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1199:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1200:   #else
1201:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1202:   #endif
1203:     PetscCallCUSPARSE(stat);
1204:   }

1206:   PetscCallCUDA(WaitForCUDA());
1207:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1209:   /* Create the solve analysis information */
1210:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1211:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1212:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1213:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1214:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1215:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1216:   #endif

1218:   /* perform the solve analysis */
1219:   /* TODO: this csr2csc and analysis setup duplicates the lower-factor code above and should be moved into a shared helper function */
1220:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1221:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1223:   PetscCallCUDA(WaitForCUDA());
1224:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1226:   /* assign the pointer */
1227:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1228:   PetscFunctionReturn(PETSC_SUCCESS);
1229: }
1230: #endif

1232: struct PetscScalarToPetscInt {
1233:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1234: };
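/* Used below when building csr2csc_i: entry indices are pushed through csr2csc as PetscScalar values, and this functor converts them back to PetscInt offsets */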

1236: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1237: {
1238:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1239:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1240:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1241:   cusparseStatus_t              stat;
1242:   cusparseIndexBase_t           indexBase;

1244:   PetscFunctionBegin;
1245:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1246:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1247:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1248:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1249:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1250:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1251:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1252:   PetscCall(PetscLogGpuTimeBegin());
1253:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1254:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1255:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1256:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1257:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1259:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1261:     /* set alpha and beta */
1262:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1263:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1264:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1265:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1267:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

1269:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1270:       CsrMatrix *matrixT      = new CsrMatrix;
1271:       matstructT->mat         = matrixT;
1272:       matrixT->num_rows       = A->cmap->n;
1273:       matrixT->num_cols       = A->rmap->n;
1274:       matrixT->num_entries    = a->nz;
1275:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1276:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1277:       matrixT->values         = new THRUSTARRAY(a->nz);

1279:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1280:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1282: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1283:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1284:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1285:                                indexBase, cusparse_scalartype);
1286:       PetscCallCUSPARSE(stat);
1287:   #else
1288:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1289:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1291:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1292:            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1293:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1294:         */
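      /* For example, with CUSPARSE_INDEX_BASE_ZERO this fills row_offsets entirely with zeros, the valid CSR row pointer of an empty matrix */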
1295:       if (matrixT->num_entries) {
1296:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1297:         PetscCallCUSPARSE(stat);

1299:       } else {
1300:         matstructT->matDescr = NULL;
1301:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1302:       }
1303:   #endif
1304: #endif
1305:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1306: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1307:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1308: #else
1309:       CsrMatrix *temp  = new CsrMatrix;
1310:       CsrMatrix *tempT = new CsrMatrix;
1311:       /* First convert HYB to CSR */
1312:       temp->num_rows       = A->rmap->n;
1313:       temp->num_cols       = A->cmap->n;
1314:       temp->num_entries    = a->nz;
1315:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1316:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1317:       temp->values         = new THRUSTARRAY(a->nz);

1319:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1320:       PetscCallCUSPARSE(stat);

1322:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1323:       tempT->num_rows       = A->rmap->n;
1324:       tempT->num_cols       = A->cmap->n;
1325:       tempT->num_entries    = a->nz;
1326:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1327:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1328:       tempT->values         = new THRUSTARRAY(a->nz);

1330:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1331:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1332:       PetscCallCUSPARSE(stat);

1334:       /* Last, convert CSC to HYB */
1335:       cusparseHybMat_t hybMat;
1336:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1337:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1338:       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1339:       PetscCallCUSPARSE(stat);

1341:       /* assign the pointer */
1342:       matstructT->mat = hybMat;
1343:       A->transupdated = PETSC_TRUE;
1344:       /* delete temporaries */
1345:       if (tempT) {
1346:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1347:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1348:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1349:         delete (CsrMatrix *)tempT;
1350:       }
1351:       if (temp) {
1352:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1353:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1354:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1355:         delete (CsrMatrix *)temp;
1356:       }
1357: #endif
1358:     }
1359:   }
1360:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1361:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1362:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1363:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1364:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1365:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1366:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1367:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1368:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1369:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1370:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1371:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1372:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1373:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1374:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1375:     }
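    /* Build the csr2csc_i index map once: run csr2csc on a sequence 0,1,2,... stored as scalars, so the transposed values record, for each entry of the transpose, the position of the matching entry in A; refreshing the transpose afterwards only needs the value gather below */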
1376:     if (!cusparsestruct->csr2csc_i) {
1377:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1378:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1380:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1381: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1382:       void  *csr2cscBuffer;
1383:       size_t csr2cscBufferSize;
1384:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1385:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1386:       PetscCallCUSPARSE(stat);
1387:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1388: #endif

1390:       if (matrix->num_entries) {
1391:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1392:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1393:            Every parameter checked out, so it is unclear why cusparse complains.

1395:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1396:            should be filled with indexBase, so we simply take that shortcut in the else branch below.
1397:         */
1398:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1399: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1400:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1401:         PetscCallCUSPARSE(stat);
1402: #else
1403:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1404:         PetscCallCUSPARSE(stat);
1405: #endif
1406:       } else {
1407:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1408:       }

1410:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1411:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1412: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1413:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1414: #endif
1415:     }
1416:     PetscCallThrust(
1417:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1418:   }
1419:   PetscCall(PetscLogGpuTimeEnd());
1420:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1421:   /* the compressed row indices are not used for matTranspose */
1422:   matstructT->cprowIndices = NULL;
1423:   /* assign the pointer */
1424:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1425:   A->transupdated                                = PETSC_TRUE;
1426:   PetscFunctionReturn(PETSC_SUCCESS);
1427: }

1429: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1430: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1431: {
1432:   const PetscScalar                    *barray;
1433:   PetscScalar                          *xarray;
1434:   thrust::device_ptr<const PetscScalar> bGPU;
1435:   thrust::device_ptr<PetscScalar>       xGPU;
1436:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1437:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1438:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1439:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1440:   PetscInt                              m   = A->rmap->n;

1442:   PetscFunctionBegin;
1443:   PetscCall(PetscLogGpuTimeBegin());
1444:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1445:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1446:   xGPU = thrust::device_pointer_cast(xarray);
1447:   bGPU = thrust::device_pointer_cast(barray);

1449:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1450:   if (fs->rpermIndices) {
1451:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1452:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1453:   } else {
1454:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1455:   }

1457:   // Solve L Y = X
1458:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1459:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1460:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

1462:   // Solve U X = Y
1463:   if (fs->cpermIndices) {
1464:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1465:   } else {
1466:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1467:   }
1468:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

1470:   // Reorder X with the column permutation if needed, and put the result back to x
1471:   if (fs->cpermIndices) {
1472:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1473:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1474:   }
1475:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1476:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1477:   PetscCall(PetscLogGpuTimeEnd());
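  /* flop count: each stored off-diagonal entry of the factors contributes one multiply-add (2 flops) and each row of U one division, i.e. 2*(nz - m) + m = 2*nz - m */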
1478:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1479:   PetscFunctionReturn(PETSC_SUCCESS);
1480: }

1482: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1483: {
1484:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1485:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1486:   const PetscScalar                    *barray;
1487:   PetscScalar                          *xarray;
1488:   thrust::device_ptr<const PetscScalar> bGPU;
1489:   thrust::device_ptr<PetscScalar>       xGPU;
1490:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1491:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1492:   PetscInt                              m   = A->rmap->n;

1494:   PetscFunctionBegin;
1495:   PetscCall(PetscLogGpuTimeBegin());
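  /* Descriptors, buffers and analysis for the transpose solves are set up lazily here and reused by later calls; the analysis is redone only after the factor values change */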
1496:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1497:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1498:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1499:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1501:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1502:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1503:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1504:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1505:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1506:   }

1508:   if (!fs->updatedTransposeSpSVAnalysis) {
1509:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1511:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1512:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1513:   }

1515:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1516:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1517:   xGPU = thrust::device_pointer_cast(xarray);
1518:   bGPU = thrust::device_pointer_cast(barray);

1520:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1521:   if (fs->rpermIndices) {
1522:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1523:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1524:   } else {
1525:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1526:   }

1528:   // Solve Ut Y = X
1529:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1530:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

1532:   // Solve Lt X = Y
1533:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1534:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1535:   } else {
1536:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1537:   }
1538:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

1540:   // Reorder X with the column permutation if needed, and put the result back to x
1541:   if (fs->cpermIndices) {
1542:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1543:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1544:   }

1546:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1547:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1548:   PetscCall(PetscLogGpuTimeEnd());
1549:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1550:   PetscFunctionReturn(PETSC_SUCCESS);
1551: }
1552: #else
1553: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1554: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1555: {
1556:   PetscInt                              n = xx->map->n;
1557:   const PetscScalar                    *barray;
1558:   PetscScalar                          *xarray;
1559:   thrust::device_ptr<const PetscScalar> bGPU;
1560:   thrust::device_ptr<PetscScalar>       xGPU;
1561:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1562:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1563:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1564:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1566:   PetscFunctionBegin;
1567:   /* Analyze the matrix and create the transpose ... on the fly */
1568:   if (!loTriFactorT && !upTriFactorT) {
1569:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1570:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1571:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1572:   }

1574:   /* Get the GPU pointers */
1575:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1576:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1577:   xGPU = thrust::device_pointer_cast(xarray);
1578:   bGPU = thrust::device_pointer_cast(barray);

1580:   PetscCall(PetscLogGpuTimeBegin());
1581:   /* First, reorder with the row permutation */
1582:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1584:   /* First, solve U */
1585:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1586:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1588:   /* Then, solve L */
1589:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1590:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1592:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1593:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1595:   /* Copy the temporary to the full solution. */
1596:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1598:   /* restore */
1599:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1600:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1601:   PetscCall(PetscLogGpuTimeEnd());
1602:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1603:   PetscFunctionReturn(PETSC_SUCCESS);
1604: }

1606: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1607: {
1608:   const PetscScalar                 *barray;
1609:   PetscScalar                       *xarray;
1610:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1611:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1612:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1613:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1615:   PetscFunctionBegin;
1616:   /* Analyze the matrix and create the transpose ... on the fly */
1617:   if (!loTriFactorT && !upTriFactorT) {
1618:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1619:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1620:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1621:   }

1623:   /* Get the GPU pointers */
1624:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1625:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1627:   PetscCall(PetscLogGpuTimeBegin());
1628:   /* First, solve U */
1629:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1630:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1632:   /* Then, solve L */
1633:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1634:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1636:   /* restore */
1637:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1638:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1639:   PetscCall(PetscLogGpuTimeEnd());
1640:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1641:   PetscFunctionReturn(PETSC_SUCCESS);
1642: }

1644: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1645: {
1646:   const PetscScalar                    *barray;
1647:   PetscScalar                          *xarray;
1648:   thrust::device_ptr<const PetscScalar> bGPU;
1649:   thrust::device_ptr<PetscScalar>       xGPU;
1650:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1651:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1652:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1653:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1655:   PetscFunctionBegin;
1656:   /* Get the GPU pointers */
1657:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1658:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1659:   xGPU = thrust::device_pointer_cast(xarray);
1660:   bGPU = thrust::device_pointer_cast(barray);

1662:   PetscCall(PetscLogGpuTimeBegin());
1663:   /* First, reorder with the row permutation */
1664:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

1666:   /* Next, solve L */
1667:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1668:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1670:   /* Then, solve U */
1671:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1672:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1674:   /* Last, reorder with the column permutation */
1675:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

1677:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1678:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1679:   PetscCall(PetscLogGpuTimeEnd());
1680:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1681:   PetscFunctionReturn(PETSC_SUCCESS);
1682: }

1684: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1685: {
1686:   const PetscScalar                 *barray;
1687:   PetscScalar                       *xarray;
1688:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1689:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1690:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1691:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1693:   PetscFunctionBegin;
1694:   /* Get the GPU pointers */
1695:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1696:   PetscCall(VecCUDAGetArrayRead(bb, &barray));

1698:   PetscCall(PetscLogGpuTimeBegin());
1699:   /* First, solve L */
1700:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1701:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1703:   /* Next, solve U */
1704:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1705:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1707:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1708:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1709:   PetscCall(PetscLogGpuTimeEnd());
1710:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1711:   PetscFunctionReturn(PETSC_SUCCESS);
1712: }
1713: #endif

1715: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1716: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1717: {
1718:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1719:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1720:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1721:   CsrMatrix                    *Acsr;
1722:   PetscInt                      m, nz;
1723:   PetscBool                     flg;

1725:   PetscFunctionBegin;
1726:   if (PetscDefined(USE_DEBUG)) {
1727:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1728:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1729:   }

1731:   /* Copy A's value to fact */
1732:   m  = fact->rmap->n;
1733:   nz = aij->nz;
1734:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1735:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1736:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1738:   PetscCall(PetscLogGpuTimeBegin());
1739:   /* Factorize fact inplace */
1740:   if (m)
1741:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1742:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1743:   if (PetscDefined(USE_DEBUG)) {
1744:     int              numerical_zero;
1745:     cusparseStatus_t status;
1746:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1747:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1748:   }
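  /* With CUDA >= 12.1.1 and a previous analysis, cusparseSpSV_updateMatrix() below just refreshes the factor values held by the SpSV descriptors; otherwise the full cusparseSpSV_analysis() is (re)done */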

1750:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1751:   if (fs->updatedSpSVAnalysis) {
1752:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1753:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1754:   } else
1755:   #endif
1756:   {
1757:     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we do it after cusparseXcsrilu02().
1758:        See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1759:     */
1760:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1762:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1764:     fs->updatedSpSVAnalysis = PETSC_TRUE;
1765:     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1766:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1767:   }

1769:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1770:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1771:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1772:   fact->ops->matsolve          = NULL;
1773:   fact->ops->matsolvetranspose = NULL;
1774:   PetscCall(PetscLogGpuTimeEnd());
1775:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1776:   PetscFunctionReturn(PETSC_SUCCESS);
1777: }

1779: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1780: {
1781:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1782:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1783:   PetscInt                      m, nz;

1785:   PetscFunctionBegin;
1786:   if (PetscDefined(USE_DEBUG)) {
1787:     PetscInt  i;
1788:     PetscBool flg, missing;

1790:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1791:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1792:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1793:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1794:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1795:   }

1797:   /* Free the old stale stuff */
1798:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1800:   /* Copy over A's metadata to fact. Note that fact's i, j, a are also allocated on the host;
1801:      they will not be used, but having them makes debugging easier.
1802:    */
1803:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1805:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1806:   fact->factortype             = MAT_FACTOR_ILU;
1807:   fact->info.factor_mallocs    = 0;
1808:   fact->info.fill_ratio_given  = info->fill;
1809:   fact->info.fill_ratio_needed = 1.0;

1811:   aij->row = NULL;
1812:   aij->col = NULL;

1814:   /* ====================================================================== */
1815:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1816:   /* We'll do in-place factorization on fact                                */
1817:   /* ====================================================================== */
1818:   const int *Ai, *Aj;

1820:   m  = fact->rmap->n;
1821:   nz = aij->nz;

1823:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1824:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1825:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1826:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1827:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1828:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1830:   /* ====================================================================== */
1831:   /* Create descriptors for M, L, U                                         */
1832:   /* ====================================================================== */
1833:   cusparseFillMode_t fillMode;
1834:   cusparseDiagType_t diagType;

1836:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1837:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1838:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1840:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1841:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1842:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1843:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1844:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1845:   */
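  /* Illustrative example (ours, not from the cuSPARSE documentation): with CUSPARSE_DIAG_TYPE_UNIT, a row of L whose
     stored entries are [2.0 (diagonal), -1.0] is treated by the triangular solve as [1.0, -1.0]; the stored 2.0 is
     neither read nor modified.
   */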
1846:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1847:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1848:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1849:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1850:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1852:   fillMode = CUSPARSE_FILL_MODE_UPPER;
1853:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1854:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1855:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1856:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1858:   /* ========================================================================= */
1859:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1860:   /* ========================================================================= */
1861:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1862:   if (m)
1863:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1864:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

1866:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1867:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1869:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1870:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1872:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1873:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1875:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1876:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1878:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1879:      and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1880:      spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
1881:      To save memory, we make factBuffer_M share storage with the bigger of spsvBuffer_L/U.
1882:    */
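  /* A small worked example of the sharing logic below (numbers made up for illustration): with spsvBufferSize_L = 100 MB,
     spsvBufferSize_U = 80 MB and factBufferSize_M = 120 MB, we allocate one max(100, 120) = 120 MB buffer shared by the
     factorization and the L-solve, plus a separate 80 MB buffer for the U-solve, i.e., 200 MB instead of the 300 MB that
     three separate buffers would need.
   */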
1883:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1884:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1885:     fs->spsvBuffer_L = fs->factBuffer_M;
1886:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1887:   } else {
1888:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1889:     fs->spsvBuffer_U = fs->factBuffer_M;
1890:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1891:   }

1893:   /* ========================================================================== */
1894:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1895:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1896:   /* ========================================================================== */
1897:   int              structural_zero;
1898:   cusparseStatus_t status;

1900:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1901:   if (m)
1902:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1903:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1904:   if (PetscDefined(USE_DEBUG)) {
1905:     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1906:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1907:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1908:   }

1910:   /* Estimate FLOPs of the numeric factorization */
1911:   {
1912:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1913:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1914:     PetscLogDouble flops = 0.0;

1916:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1917:     Ai    = Aseq->i;
1918:     Adiag = Aseq->diag;
1919:     for (PetscInt i = 0; i < m; i++) {
1920:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1921:         nzRow  = Ai[i + 1] - Ai[i];
1922:         nzLeft = Adiag[i] - Ai[i];
1923:         /* We want to eliminate the nonzeros to the left of the diagonal one by one. Each elimination updates the
1924:           nonzeros to its right (the eliminated one included), which incurs a multiplication and an addition per entry.
1925:         */
1927:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1928:       }
1929:     }
1930:     fs->numericFactFlops = flops;
1931:   }
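  /* Derivation of the flop count above (a sketch): eliminating the j-th (j = 1..nzLeft) nonzero to the left of the
     diagonal updates the nzRow - j + 1 entries from that position to the end of the row, at 2 flops (one multiply,
     one add) each. Summing over j gives
       sum_{j=1}^{nzLeft} 2*(nzRow - j + 1) = 2*nzLeft*nzRow - nzLeft*(nzLeft + 1) + 2*nzLeft = nzLeft*(2*nzRow - nzLeft + 1),
     which is the expression accumulated into flops.
   */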
1932:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1933:   PetscFunctionReturn(PETSC_SUCCESS);
1934: }

1936: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1937: {
1938:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1939:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1940:   const PetscScalar            *barray;
1941:   PetscScalar                  *xarray;

1943:   PetscFunctionBegin;
1944:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1945:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1946:   PetscCall(PetscLogGpuTimeBegin());

1948:   /* Solve L*y = b */
1949:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1950:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1951:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1952:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1954:   /* Solve Lt*x = y */
1955:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1956:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1957:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1959:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1960:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1962:   PetscCall(PetscLogGpuTimeEnd());
1963:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1964:   PetscFunctionReturn(PETSC_SUCCESS);
1965: }

1967: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1968: {
1969:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1970:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1971:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1972:   CsrMatrix                    *Acsr;
1973:   PetscInt                      m, nz;
1974:   PetscBool                     flg;

1976:   PetscFunctionBegin;
1977:   if (PetscDefined(USE_DEBUG)) {
1978:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1979:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1980:   }

1982:   /* Copy A's value to fact */
1983:   m  = fact->rmap->n;
1984:   nz = aij->nz;
1985:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1986:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1987:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1989:   /* Factorize fact inplace */
1990:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1991:      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1992:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1993:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1994:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1995:    */
1996:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1997:   if (PetscDefined(USE_DEBUG)) {
1998:     int              numerical_zero;
1999:     cusparseStatus_t status;
2000:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2001:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2002:   }

2004:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2005:   if (fs->updatedSpSVAnalysis) {
2006:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2007:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2008:   } else
2009:   #endif
2010:   {
2011:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

2013:     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
2014:        "** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F"
2015:      */
2016:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2017:     fs->updatedSpSVAnalysis = PETSC_TRUE;
2018:   }

2020:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2021:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2022:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2023:   fact->ops->matsolve          = NULL;
2024:   fact->ops->matsolvetranspose = NULL;
2025:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2026:   PetscFunctionReturn(PETSC_SUCCESS);
2027: }

2029: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2030: {
2031:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2032:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2033:   PetscInt                      m, nz;

2035:   PetscFunctionBegin;
2036:   if (PetscDefined(USE_DEBUG)) {
2037:     PetscInt  i;
2038:     PetscBool flg, missing;

2040:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2041:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2042:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2043:     PetscCall(MatMissingDiagonal(A, &missing, &i));
2044:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2045:   }

2047:   /* Free the old stale stuff */
2048:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

2050:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
2051:      but they will not be used; we allocate them just for easy debugging.
2052:    */
2053:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

2055:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2056:   fact->factortype             = MAT_FACTOR_ICC;
2057:   fact->info.factor_mallocs    = 0;
2058:   fact->info.fill_ratio_given  = info->fill;
2059:   fact->info.fill_ratio_needed = 1.0;

2061:   aij->row = NULL;
2062:   aij->col = NULL;

2064:   /* ====================================================================== */
2065:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2066:   /* We'll do in-place factorization on fact                                */
2067:   /* ====================================================================== */
2068:   const int *Ai, *Aj;

2070:   m  = fact->rmap->n;
2071:   nz = aij->nz;

2073:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2074:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2075:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2076:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2077:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2078:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

2080:   /* ====================================================================== */
2081:   /* Create mat descriptors for M, L                                        */
2082:   /* ====================================================================== */
2083:   cusparseFillMode_t fillMode;
2084:   cusparseDiagType_t diagType;

2086:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2087:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2088:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

2090:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2091:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2092:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2093:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2094:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2095:   */
2096:   fillMode = CUSPARSE_FILL_MODE_LOWER;
2097:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2098:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2099:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2100:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

2102:   /* ========================================================================= */
2103:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2104:   /* ========================================================================= */
2105:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2106:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

2108:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2109:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

2111:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2112:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

2114:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2115:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

2117:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2118:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

2120:   /* To save device memory, we make the factorization buffer share storage with one of the solve buffers.
2121:      See also the comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2122:    */
2123:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2124:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2125:     fs->spsvBuffer_L = fs->factBuffer_M;
2126:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2127:   } else {
2128:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2129:     fs->spsvBuffer_Lt = fs->factBuffer_M;
2130:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2131:   }

2133:   /* ========================================================================== */
2134:   /* Perform analysis of ic0 on M                                               */
2135:   /* The lower triangular part of M has the same sparsity pattern as L          */
2136:   /* ========================================================================== */
2137:   int              structural_zero;
2138:   cusparseStatus_t status;

2140:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2141:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2142:   if (PetscDefined(USE_DEBUG)) {
2143:     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2144:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2145:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2146:   }

2148:   /* Estimate FLOPs of the numeric factorization */
2149:   {
2150:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2151:     PetscInt      *Ai, nzRow, nzLeft;
2152:     PetscLogDouble flops = 0.0;

2154:     Ai = Aseq->i;
2155:     for (PetscInt i = 0; i < m; i++) {
2156:       nzRow = Ai[i + 1] - Ai[i];
2157:       if (nzRow > 1) {
2158:         /* We want to eliminate the nonzeros to the left of the diagonal one by one. Each elimination updates the
2159:           nonzeros to its right (the eliminated one included), which incurs a multiplication and an addition per entry.
2160:           Since the diagonal positions are not at hand here, assume about half of the off-diagonal nonzeros lie to the left. */
2161:         nzLeft = (nzRow - 1) / 2;
2162:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2163:       }
2164:     }
2165:     fs->numericFactFlops = flops;
2166:   }
2167:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2168:   PetscFunctionReturn(PETSC_SUCCESS);
2169: }
2170: #endif

2172: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2173: {
2174:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2175:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

2177:   PetscFunctionBegin;
2178:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2179:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2180:   B->offloadmask = PETSC_OFFLOAD_CPU;

2182:   if (!cusparsestruct->use_cpu_solve) {
2183: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2184:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2185:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2186: #else
2187:     /* determine which version of MatSolve needs to be used. */
2188:     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2189:     IS          isrow = b->row, iscol = b->col;
2190:     PetscBool   row_identity, col_identity;

2192:     PetscCall(ISIdentity(isrow, &row_identity));
2193:     PetscCall(ISIdentity(iscol, &col_identity));
2194:     if (row_identity && col_identity) {
2195:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2196:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2197:     } else {
2198:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2199:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2200:     }
2201: #endif
2202:   }
2203:   B->ops->matsolve          = NULL;
2204:   B->ops->matsolvetranspose = NULL;

2206:   /* get the triangular factors */
2207:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2208:   PetscFunctionReturn(PETSC_SUCCESS);
2209: }

2211: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2212: {
2213:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

2215:   PetscFunctionBegin;
2216:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2217:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2218:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2219:   PetscFunctionReturn(PETSC_SUCCESS);
2220: }

2222: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2223: {
2224:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2226:   PetscFunctionBegin;
2227: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2228:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2229:   if (!info->factoronhost) {
2230:     PetscCall(ISIdentity(isrow, &row_identity));
2231:     PetscCall(ISIdentity(iscol, &col_identity));
2232:   }
2233:   if (!info->levels && row_identity && col_identity) {
2234:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2235:   } else
2236: #endif
2237:   {
2238:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2239:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2240:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2241:   }
2242:   PetscFunctionReturn(PETSC_SUCCESS);
2243: }

2245: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2246: {
2247:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2249:   PetscFunctionBegin;
2250: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2251:   PetscBool perm_identity = PETSC_FALSE;
2252:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2253:   if (!info->levels && perm_identity) {
2254:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2255:   } else
2256: #endif
2257:   {
2258:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2259:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2260:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2261:   }
2262:   PetscFunctionReturn(PETSC_SUCCESS);
2263: }

2265: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2266: {
2267:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2269:   PetscFunctionBegin;
2270:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2271:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2272:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2273:   PetscFunctionReturn(PETSC_SUCCESS);
2274: }

2276: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2277: {
2278:   PetscFunctionBegin;
2279:   *type = MATSOLVERCUSPARSE;
2280:   PetscFunctionReturn(PETSC_SUCCESS);
2281: }

2283: /*MC
2284:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2285:   of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2286:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2287:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2288:   cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2289:   algorithms are not recommended. This class does NOT support direct solver operations.

2291:   Level: beginner

2293: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2294:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2295: M*/
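/* Usage sketch (not part of this file; assumes a linear solve with a KSP/PC already set up): the solver can be
   selected from the options database, e.g.

     ./app -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or in code with

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
*/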

2297: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2298: {
2299:   PetscInt n = A->rmap->n;

2301:   PetscFunctionBegin;
2302:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2303:   PetscCall(MatSetSizes(*B, n, n, n, n));
2304:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2305:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

2307:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2308:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2309:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2310:     if (!A->boundtocpu) {
2311:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2312:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2313:     } else {
2314:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2315:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2316:     }
2317:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2318:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2319:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2320:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2321:     if (!A->boundtocpu) {
2322:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2323:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2324:     } else {
2325:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2326:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2327:     }
2328:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2329:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2330:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

2332:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2333:   (*B)->canuseordering = PETSC_TRUE;
2334:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2335:   PetscFunctionReturn(PETSC_SUCCESS);
2336: }

2338: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2339: {
2340:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2341:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2342: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2343:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2344: #endif

2346:   PetscFunctionBegin;
2347:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2348:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2349:     if (A->factortype == MAT_FACTOR_NONE) {
2350:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2351:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2352:     }
2353: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2354:     else if (fs->csrVal) {
2355:       /* We have a factorized matrix on device and are able to copy it to host */
2356:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2357:     }
2358: #endif
2359:     else
2360:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2361:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2362:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2363:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2364:   }
2365:   PetscFunctionReturn(PETSC_SUCCESS);
2366: }

2368: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2369: {
2370:   PetscFunctionBegin;
2371:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2372:   *array = ((Mat_SeqAIJ *)A->data)->a;
2373:   PetscFunctionReturn(PETSC_SUCCESS);
2374: }

2376: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2377: {
2378:   PetscFunctionBegin;
2379:   A->offloadmask = PETSC_OFFLOAD_CPU;
2380:   *array         = NULL;
2381:   PetscFunctionReturn(PETSC_SUCCESS);
2382: }

2384: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2385: {
2386:   PetscFunctionBegin;
2387:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2388:   *array = ((Mat_SeqAIJ *)A->data)->a;
2389:   PetscFunctionReturn(PETSC_SUCCESS);
2390: }

2392: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2393: {
2394:   PetscFunctionBegin;
2395:   *array = NULL;
2396:   PetscFunctionReturn(PETSC_SUCCESS);
2397: }

2399: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2400: {
2401:   PetscFunctionBegin;
2402:   *array = ((Mat_SeqAIJ *)A->data)->a;
2403:   PetscFunctionReturn(PETSC_SUCCESS);
2404: }

2406: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2407: {
2408:   PetscFunctionBegin;
2409:   A->offloadmask = PETSC_OFFLOAD_CPU;
2410:   *array         = NULL;
2411:   PetscFunctionReturn(PETSC_SUCCESS);
2412: }
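/* Summary of the offload-mask handling in the Get/Restore array callbacks above: read and read/write accesses first
   sync the host copy via MatSeqAIJCUSPARSECopyFromGPU() (leaving the mask at PETSC_OFFLOAD_BOTH), restoring a writable
   array marks the host copy as authoritative by setting the mask to PETSC_OFFLOAD_CPU so the next device use triggers a
   fresh copy in MatSeqAIJCUSPARSECopyToGPU(), and write-only access skips the device-to-host copy entirely. */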

2414: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2415: {
2416:   Mat_SeqAIJCUSPARSE *cusp;
2417:   CsrMatrix          *matrix;

2419:   PetscFunctionBegin;
2420:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2421:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2422:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2423:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2424:   matrix = (CsrMatrix *)cusp->mat->mat;

2426:   if (i) {
2427: #if !defined(PETSC_USE_64BIT_INDICES)
2428:     *i = matrix->row_offsets->data().get();
2429: #else
2430:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2431: #endif
2432:   }
2433:   if (j) {
2434: #if !defined(PETSC_USE_64BIT_INDICES)
2435:     *j = matrix->column_indices->data().get();
2436: #else
2437:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2438: #endif
2439:   }
2440:   if (a) *a = matrix->values->data().get();
2441:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2442:   PetscFunctionReturn(PETSC_SUCCESS);
2443: }
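/* Usage sketch (an assumption about typical caller code, not taken from this file): the public entry point
   MatSeqAIJGetCSRAndMemType() dispatches here for this matrix type and hands back device pointers, e.g.

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype)); // mtype is PETSC_MEMTYPE_CUDA; i, j, a live on the device
*/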

2445: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2446: {
2447:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2448:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2449:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2450:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2451:   cusparseStatus_t              stat;
2452:   PetscBool                     both = PETSC_TRUE;

2454:   PetscFunctionBegin;
2455:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2456:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2457:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2458:       CsrMatrix *matrix;
2459:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2461:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2462:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2463:       matrix->values->assign(a->a, a->a + a->nz);
2464:       PetscCallCUDA(WaitForCUDA());
2465:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2466:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2467:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2468:     } else {
2469:       PetscInt nnz;
2470:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2471:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2472:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2473:       delete cusparsestruct->workVector;
2474:       delete cusparsestruct->rowoffsets_gpu;
2475:       cusparsestruct->workVector     = NULL;
2476:       cusparsestruct->rowoffsets_gpu = NULL;
2477:       try {
2478:         if (a->compressedrow.use) {
2479:           m    = a->compressedrow.nrows;
2480:           ii   = a->compressedrow.i;
2481:           ridx = a->compressedrow.rindex;
2482:         } else {
2483:           m    = A->rmap->n;
2484:           ii   = a->i;
2485:           ridx = NULL;
2486:         }
2487:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2488:         if (!a->a) {
2489:           nnz  = ii[m];
2490:           both = PETSC_FALSE;
2491:         } else nnz = a->nz;
2492:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2494:         /* create cusparse matrix */
2495:         cusparsestruct->nrows = m;
2496:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2497:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2498:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2499:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

2501:         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2502:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2503:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2504:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2506:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2507:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

2509:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2510:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2511:           /* set the matrix */
2512:           CsrMatrix *mat   = new CsrMatrix;
2513:           mat->num_rows    = m;
2514:           mat->num_cols    = A->cmap->n;
2515:           mat->num_entries = nnz;
2516:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2517:           mat->row_offsets->assign(ii, ii + m + 1);

2519:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2520:           mat->column_indices->assign(a->j, a->j + nnz);

2522:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2523:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2525:           /* assign the pointer */
2526:           matstruct->mat = mat;
2527: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2528:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2529:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2530:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2531:             PetscCallCUSPARSE(stat);
2532:           }
2533: #endif
2534:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2535: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2536:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2537: #else
2538:           CsrMatrix *mat   = new CsrMatrix;
2539:           mat->num_rows    = m;
2540:           mat->num_cols    = A->cmap->n;
2541:           mat->num_entries = nnz;
2542:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2543:           mat->row_offsets->assign(ii, ii + m + 1);

2545:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2546:           mat->column_indices->assign(a->j, a->j + nnz);

2548:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2549:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2551:           cusparseHybMat_t hybMat;
2552:           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2553:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2554:           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2555:           PetscCallCUSPARSE(stat);
2556:           /* assign the pointer */
2557:           matstruct->mat = hybMat;

2559:           if (mat) {
2560:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2561:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2562:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2563:             delete (CsrMatrix *)mat;
2564:           }
2565: #endif
2566:         }

2568:         /* assign the compressed row indices */
2569:         if (a->compressedrow.use) {
2570:           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2571:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2572:           matstruct->cprowIndices->assign(ridx, ridx + m);
2573:           tmp = m;
2574:         } else {
2575:           cusparsestruct->workVector = NULL;
2576:           matstruct->cprowIndices    = NULL;
2577:           tmp                        = 0;
2578:         }
2579:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2581:         /* assign the pointer */
2582:         cusparsestruct->mat = matstruct;
2583:       } catch (char *ex) {
2584:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2585:       }
2586:       PetscCallCUDA(WaitForCUDA());
2587:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2588:       cusparsestruct->nonzerostate = A->nonzerostate;
2589:     }
2590:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2591:   }
2592:   PetscFunctionReturn(PETSC_SUCCESS);
2593: }

2595: struct VecCUDAPlusEquals {
2596:   template <typename Tuple>
2597:   __host__ __device__ void operator()(Tuple t)
2598:   {
2599:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2600:   }
2601: };

2603: struct VecCUDAEquals {
2604:   template <typename Tuple>
2605:   __host__ __device__ void operator()(Tuple t)
2606:   {
2607:     thrust::get<1>(t) = thrust::get<0>(t);
2608:   }
2609: };

2611: struct VecCUDAEqualsReverse {
2612:   template <typename Tuple>
2613:   __host__ __device__ void operator()(Tuple t)
2614:   {
2615:     thrust::get<0>(t) = thrust::get<1>(t);
2616:   }
2617: };
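/* A sketch (illustrative, with hypothetical device vectors x and y) of how functors like these are applied with
   Thrust: pass them to thrust::for_each over a zip of two device ranges, e.g.

     thrust::device_vector<PetscScalar> x(n), y(n);
     // y[i] += x[i] for all i, computed on the device
     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
                      VecCUDAPlusEquals());
*/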

2619: struct MatMatCusparse {
2620:   PetscBool      cisdense;
2621:   PetscScalar   *Bt;
2622:   Mat            X;
2623:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2624:   PetscLogDouble flops;
2625:   CsrMatrix     *Bcsr;

2627: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2628:   cusparseSpMatDescr_t matSpBDescr;
2629:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2630:   cusparseDnMatDescr_t matBDescr;
2631:   cusparseDnMatDescr_t matCDescr;
2632:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2633:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2634:   void *dBuffer4;
2635:   void *dBuffer5;
2636:   #endif
2637:   size_t                mmBufferSize;
2638:   void                 *mmBuffer;
2639:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2640:   cusparseSpGEMMDescr_t spgemmDesc;
2641: #endif
2642: };

2644: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2645: {
2646:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2648:   PetscFunctionBegin;
2649:   PetscCallCUDA(cudaFree(mmdata->Bt));
2650:   delete mmdata->Bcsr;
2651: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2652:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2653:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2654:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2655:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2656:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2657:   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2658:   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2659:   #endif
2660:   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2661:   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2662: #endif
2663:   PetscCall(MatDestroy(&mmdata->X));
2664:   PetscCall(PetscFree(data));
2665:   PetscFunctionReturn(PETSC_SUCCESS);
2666: }

2668: #include <../src/mat/impls/dense/seq/dense.h>

2670: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2671: {
2672:   Mat_Product                  *product = C->product;
2673:   Mat                           A, B;
2674:   PetscInt                      m, n, blda, clda;
2675:   PetscBool                     flg, biscuda;
2676:   Mat_SeqAIJCUSPARSE           *cusp;
2677:   cusparseStatus_t              stat;
2678:   cusparseOperation_t           opA;
2679:   const PetscScalar            *barray;
2680:   PetscScalar                  *carray;
2681:   MatMatCusparse               *mmdata;
2682:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2683:   CsrMatrix                    *csrmat;

2685:   PetscFunctionBegin;
2686:   MatCheckProduct(C, 1);
2687:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2688:   mmdata = (MatMatCusparse *)product->data;
2689:   A      = product->A;
2690:   B      = product->B;
2691:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2692:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2693:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2694:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2695:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2696:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2697:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2698:   switch (product->type) {
2699:   case MATPRODUCT_AB:
2700:   case MATPRODUCT_PtAP:
2701:     mat = cusp->mat;
2702:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2703:     m   = A->rmap->n;
2704:     n   = B->cmap->n;
2705:     break;
2706:   case MATPRODUCT_AtB:
2707:     if (!A->form_explicit_transpose) {
2708:       mat = cusp->mat;
2709:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2710:     } else {
2711:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2712:       mat = cusp->matTranspose;
2713:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2714:     }
2715:     m = A->cmap->n;
2716:     n = B->cmap->n;
2717:     break;
2718:   case MATPRODUCT_ABt:
2719:   case MATPRODUCT_RARt:
2720:     mat = cusp->mat;
2721:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2722:     m   = A->rmap->n;
2723:     n   = B->rmap->n;
2724:     break;
2725:   default:
2726:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2727:   }
2728:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2729:   csrmat = (CsrMatrix *)mat->mat;
2730:   /* if the user passed a CPU matrix, copy the data to the GPU */
2731:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2732:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2733:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

2735:   PetscCall(MatDenseGetLDA(B, &blda));
2736:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2737:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2738:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2739:   } else {
2740:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2741:     PetscCall(MatDenseGetLDA(C, &clda));
2742:   }

2744:   PetscCall(PetscLogGpuTimeBegin());
2745: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2746:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2747:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2748:   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2749:   #else
2750:   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2751:   #endif

2753:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2754:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2755:     size_t mmBufferSize;
2756:     if (mmdata->initialized && mmdata->Blda != blda) {
2757:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2758:       mmdata->matBDescr = NULL;
2759:     }
2760:     if (!mmdata->matBDescr) {
2761:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2762:       mmdata->Blda = blda;
2763:     }

2765:     if (mmdata->initialized && mmdata->Clda != clda) {
2766:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2767:       mmdata->matCDescr = NULL;
2768:     }
2769:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2770:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2771:       mmdata->Clda = clda;
2772:     }

2774:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2775:     if (matADescr) {
2776:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We find we cannot reuse matADescr; it could be a cusparse bug
2777:       matADescr = NULL;
2778:     }
2779:   #endif

2781:     if (!matADescr) {
2782:       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2783:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2784:       PetscCallCUSPARSE(stat);
2785:     }

2787:     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

2789:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2790:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2791:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2792:       mmdata->mmBufferSize = mmBufferSize;
2793:     }

2795:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2796:     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2797:   #endif

2799:     mmdata->initialized = PETSC_TRUE;
2800:   } else {
2801:     /* to be safe, always update pointers of the mats */
2802:     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2803:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2804:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2805:   }

2807:   /* do cusparseSpMM, which supports transpose on B */
2808:   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2809: #else
2810:   PetscInt k;
2811:   /* cusparseXcsrmm does not support transpose on B */
2812:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2813:     cublasHandle_t cublasv2handle;
2814:     cublasStatus_t cerr;

2816:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2817:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2818:     PetscCallCUBLAS(cerr);
2819:     blda = B->cmap->n;
2820:     k    = B->cmap->n;
2821:   } else {
2822:     k = B->rmap->n;
2823:   }

2825:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2826:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2827:   PetscCallCUSPARSE(stat);
2828: #endif
2829:   PetscCall(PetscLogGpuTimeEnd());
2830:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2831:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2832:   if (product->type == MATPRODUCT_RARt) {
2833:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2834:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2835:   } else if (product->type == MATPRODUCT_PtAP) {
2836:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2837:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2838:   } else {
2839:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2840:   }
2841:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2842:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2843:   PetscFunctionReturn(PETSC_SUCCESS);
2844: }

2846: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2847: {
2848:   Mat_Product        *product = C->product;
2849:   Mat                 A, B;
2850:   PetscInt            m, n;
2851:   PetscBool           cisdense, flg;
2852:   MatMatCusparse     *mmdata;
2853:   Mat_SeqAIJCUSPARSE *cusp;

2855:   PetscFunctionBegin;
2856:   MatCheckProduct(C, 1);
2857:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2858:   A = product->A;
2859:   B = product->B;
2860:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2861:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2862:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2863:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2864:   switch (product->type) {
2865:   case MATPRODUCT_AB:
2866:     m = A->rmap->n;
2867:     n = B->cmap->n;
2868:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2869:     break;
2870:   case MATPRODUCT_AtB:
2871:     m = A->cmap->n;
2872:     n = B->cmap->n;
2873:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2874:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2875:     break;
2876:   case MATPRODUCT_ABt:
2877:     m = A->rmap->n;
2878:     n = B->rmap->n;
2879:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2880:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2881:     break;
2882:   case MATPRODUCT_PtAP:
2883:     m = B->cmap->n;
2884:     n = B->cmap->n;
2885:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2886:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2887:     break;
2888:   case MATPRODUCT_RARt:
2889:     m = B->rmap->n;
2890:     n = B->rmap->n;
2891:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2892:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2893:     break;
2894:   default:
2895:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2896:   }
2897:   PetscCall(MatSetSizes(C, m, n, m, n));
2898:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2899:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2900:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

2902:   /* product data */
2903:   PetscCall(PetscNew(&mmdata));
2904:   mmdata->cisdense = cisdense;
2905: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2906:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2907:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2908: #endif
2909:   /* for these products we need intermediate storage */
2910:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2911:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2912:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2913:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2914:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2915:     } else {
2916:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2917:     }
2918:   }
2919:   C->product->data    = mmdata;
2920:   C->product->destroy = MatDestroy_MatMatCusparse;

2922:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2923:   PetscFunctionReturn(PETSC_SUCCESS);
2924: }
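/* Numeric phase for sparse-sparse products: it reuses the symbolic SpGEMM data stored in C->product->data.
   If the values were already computed during the symbolic phase (reusesym), only the assembly bookkeeping is done;
   otherwise the cuSPARSE SpGEMM compute/copy kernels are rerun with the current values of A and B. */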

2926: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2927: {
2928:   Mat_Product                  *product = C->product;
2929:   Mat                           A, B;
2930:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2931:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2932:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2933:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2934:   PetscBool                     flg;
2935:   cusparseStatus_t              stat;
2936:   MatProductType                ptype;
2937:   MatMatCusparse               *mmdata;
2938: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2939:   cusparseSpMatDescr_t BmatSpDescr;
2940: #endif
2941:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2943:   PetscFunctionBegin;
2944:   MatCheckProduct(C, 1);
2945:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2946:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2947:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2948:   mmdata = (MatMatCusparse *)C->product->data;
2949:   A      = product->A;
2950:   B      = product->B;
2951:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed during the MatProductSymbolic phase */
2952:     mmdata->reusesym = PETSC_FALSE;
2953:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2954:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2955:     Cmat = Ccusp->mat;
2956:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2957:     Ccsr = (CsrMatrix *)Cmat->mat;
2958:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2959:     goto finalize;
2960:   }
2961:   if (!c->nz) goto finalize;
2962:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2963:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2964:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2965:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2966:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2967:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2968:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2969:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2970:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2971:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2973:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2974:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2975:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

2977:   ptype = product->type;
2978:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2979:     ptype = MATPRODUCT_AB;
2980:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2981:   }
2982:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2983:     ptype = MATPRODUCT_AB;
2984:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2985:   }
2986:   switch (ptype) {
2987:   case MATPRODUCT_AB:
2988:     Amat = Acusp->mat;
2989:     Bmat = Bcusp->mat;
2990:     break;
2991:   case MATPRODUCT_AtB:
2992:     Amat = Acusp->matTranspose;
2993:     Bmat = Bcusp->mat;
2994:     break;
2995:   case MATPRODUCT_ABt:
2996:     Amat = Acusp->mat;
2997:     Bmat = Bcusp->matTranspose;
2998:     break;
2999:   default:
3000:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3001:   }
3002:   Cmat = Ccusp->mat;
3003:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3004:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3005:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3006:   Acsr = (CsrMatrix *)Amat->mat;
3007:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3008:   Ccsr = (CsrMatrix *)Cmat->mat;
3009:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3010:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3011:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3012:   PetscCall(PetscLogGpuTimeBegin());
3013: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3014:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3015:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3016:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3017:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3018:   PetscCallCUSPARSE(stat);
3019:   #else
3020:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3021:   PetscCallCUSPARSE(stat);
3022:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3023:   PetscCallCUSPARSE(stat);
3024:   #endif
3025: #else
3026:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3027:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3028:   PetscCallCUSPARSE(stat);
3029: #endif
3030:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3031:   PetscCallCUDA(WaitForCUDA());
3032:   PetscCall(PetscLogGpuTimeEnd());
3033:   C->offloadmask = PETSC_OFFLOAD_GPU;
3034: finalize:
3035:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3036:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3037:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3038:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3039:   c->reallocs = 0;
3040:   C->info.mallocs += 0;
3041:   C->info.nz_unneeded = 0;
3042:   C->assembled = C->was_assembled = PETSC_TRUE;
3043:   C->num_ass++;
3044:   PetscFunctionReturn(PETSC_SUCCESS);
3045: }
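/* Symbolic phase for sparse-sparse products: builds the CSR structure of C entirely on the GPU with cuSPARSE SpGEMM
   (or csrgemm for CUDA < 11), handling compressed row storage of A and B, and then copies the row offsets and column
   indices back to the host to fill in the usual Mat_SeqAIJ metadata. */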

3047: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3048: {
3049:   Mat_Product                  *product = C->product;
3050:   Mat                           A, B;
3051:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3052:   Mat_SeqAIJ                   *a, *b, *c;
3053:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3054:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3055:   PetscInt                      i, j, m, n, k;
3056:   PetscBool                     flg;
3057:   cusparseStatus_t              stat;
3058:   MatProductType                ptype;
3059:   MatMatCusparse               *mmdata;
3060:   PetscLogDouble                flops;
3061:   PetscBool                     biscompressed, ciscompressed;
3062: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3063:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3064:   cusparseSpMatDescr_t BmatSpDescr;
3065: #else
3066:   int cnz;
3067: #endif
3068:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

3070:   PetscFunctionBegin;
3071:   MatCheckProduct(C, 1);
3072:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3073:   A = product->A;
3074:   B = product->B;
3075:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3076:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3077:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3078:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3079:   a = (Mat_SeqAIJ *)A->data;
3080:   b = (Mat_SeqAIJ *)B->data;
3081:   /* product data */
3082:   PetscCall(PetscNew(&mmdata));
3083:   C->product->data    = mmdata;
3084:   C->product->destroy = MatDestroy_MatMatCusparse;

3086:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3087:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3088:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3089:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3090:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3091:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

3093:   ptype = product->type;
3094:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3095:     ptype                                          = MATPRODUCT_AB;
3096:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3097:   }
3098:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3099:     ptype                                          = MATPRODUCT_AB;
3100:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3101:   }
3102:   biscompressed = PETSC_FALSE;
3103:   ciscompressed = PETSC_FALSE;
3104:   switch (ptype) {
3105:   case MATPRODUCT_AB:
3106:     m    = A->rmap->n;
3107:     n    = B->cmap->n;
3108:     k    = A->cmap->n;
3109:     Amat = Acusp->mat;
3110:     Bmat = Bcusp->mat;
3111:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3112:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3113:     break;
3114:   case MATPRODUCT_AtB:
3115:     m = A->cmap->n;
3116:     n = B->cmap->n;
3117:     k = A->rmap->n;
3118:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3119:     Amat = Acusp->matTranspose;
3120:     Bmat = Bcusp->mat;
3121:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3122:     break;
3123:   case MATPRODUCT_ABt:
3124:     m = A->rmap->n;
3125:     n = B->rmap->n;
3126:     k = A->cmap->n;
3127:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3128:     Amat = Acusp->mat;
3129:     Bmat = Bcusp->matTranspose;
3130:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3131:     break;
3132:   default:
3133:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3134:   }

3136:   /* create cusparse matrix */
3137:   PetscCall(MatSetSizes(C, m, n, m, n));
3138:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3139:   c     = (Mat_SeqAIJ *)C->data;
3140:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3141:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3142:   Ccsr  = new CsrMatrix;

3144:   c->compressedrow.use = ciscompressed;
3145:   if (c->compressedrow.use) { /* if a uses compressed row storage, then c will be in compressed row format too */
3146:     c->compressedrow.nrows = a->compressedrow.nrows;
3147:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3148:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3149:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3150:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3151:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3152:   } else {
3153:     c->compressedrow.nrows  = 0;
3154:     c->compressedrow.i      = NULL;
3155:     c->compressedrow.rindex = NULL;
3156:     Ccusp->workVector       = NULL;
3157:     Cmat->cprowIndices      = NULL;
3158:   }
3159:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3160:   Ccusp->mat        = Cmat;
3161:   Ccusp->mat->mat   = Ccsr;
3162:   Ccsr->num_rows    = Ccusp->nrows;
3163:   Ccsr->num_cols    = n;
3164:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3165:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3166:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3167:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3168:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3169:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3170:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3171:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3174:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in various calls when matrices have zero rows/columns! */
3175:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3176:     c->nz                = 0;
3177:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3178:     Ccsr->values         = new THRUSTARRAY(c->nz);
3179:     goto finalizesym;
3180:   }

3182:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3183:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3184:   Acsr = (CsrMatrix *)Amat->mat;
3185:   if (!biscompressed) {
3186:     Bcsr = (CsrMatrix *)Bmat->mat;
3187: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3188:     BmatSpDescr = Bmat->matDescr;
3189: #endif
3190:   } else { /* we need to use row offsets for the full matrix */
3191:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3192:     Bcsr                 = new CsrMatrix;
3193:     Bcsr->num_rows       = B->rmap->n;
3194:     Bcsr->num_cols       = cBcsr->num_cols;
3195:     Bcsr->num_entries    = cBcsr->num_entries;
3196:     Bcsr->column_indices = cBcsr->column_indices;
3197:     Bcsr->values         = cBcsr->values;
3198:     if (!Bcusp->rowoffsets_gpu) {
3199:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3200:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3201:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3202:     }
3203:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3204:     mmdata->Bcsr      = Bcsr;
3205: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3206:     if (Bcsr->num_rows && Bcsr->num_cols) {
3207:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3208:       PetscCallCUSPARSE(stat);
3209:     }
3210:     BmatSpDescr = mmdata->matSpBDescr;
3211: #endif
3212:   }
3213:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3214:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3215:   /* precompute the flop count */
3216:   if (ptype == MATPRODUCT_AB) {
3217:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3218:       const PetscInt st = a->i[i];
3219:       const PetscInt en = a->i[i + 1];
3220:       for (j = st; j < en; j++) {
3221:         const PetscInt brow = a->j[j];
3222:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3223:       }
3224:     }
3225:   } else if (ptype == MATPRODUCT_AtB) {
3226:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3227:       const PetscInt anzi = a->i[i + 1] - a->i[i];
3228:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3229:       flops += (2. * anzi) * bnzi;
3230:     }
3231:   } else { /* TODO */
3232:     flops = 0.;
3233:   }

3235:   mmdata->flops = flops;
3236:   PetscCall(PetscLogGpuTimeBegin());

3238: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3239:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3240:   // cuda-12.2 requires non-null csrRowOffsets
3241:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3242:   PetscCallCUSPARSE(stat);
3243:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3244:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3245:   {
3246:     /* cusparseSpGEMMreuse has a more reasonable API than cusparseSpGEMM, so we prefer to use it.
3247:        We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3248:     */
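    /* The reuse-based SpGEMM proceeds in stages: workEstimation (query the buffer size, then run), nnz (query sizes,
       then run, which determines the number of nonzeros of C), copy (query the size, then run, which fills the column
       indices of C), and finally compute, which fills the values. Buffers 4 and 5 must be kept alive in mmdata since
       they are needed by later cusparseSpGEMMreuse_compute calls in the numeric phase. */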
3249:     void *dBuffer1 = NULL;
3250:     void *dBuffer2 = NULL;
3251:     void *dBuffer3 = NULL;
3252:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3253:     size_t bufferSize1 = 0;
3254:     size_t bufferSize2 = 0;
3255:     size_t bufferSize3 = 0;
3256:     size_t bufferSize4 = 0;
3257:     size_t bufferSize5 = 0;

3259:     /* query the number of bytes (bufferSize1) needed for the external work buffer */
3260:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3261:     PetscCallCUSPARSE(stat);
3262:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3263:     /* inspect the matrices A and B to understand the memory requirement for the next step */
3264:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3265:     PetscCallCUSPARSE(stat);

3267:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3268:     PetscCallCUSPARSE(stat);
3269:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3270:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3271:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3272:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3273:     PetscCallCUSPARSE(stat);
3274:     PetscCallCUDA(cudaFree(dBuffer1));
3275:     PetscCallCUDA(cudaFree(dBuffer2));

3277:     /* get matrix C non-zero entries C_nnz1 */
3278:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3279:     c->nz = (PetscInt)C_nnz1;
3280:     /* allocate matrix C */
3281:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3282:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283:     Ccsr->values = new THRUSTARRAY(c->nz);
3284:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285:     /* update Cmat->matDescr with the new pointers */
3286:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3287:     PetscCallCUSPARSE(stat);

3289:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3290:     PetscCallCUSPARSE(stat);
3291:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3292:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3293:     PetscCallCUSPARSE(stat);
3294:     PetscCallCUDA(cudaFree(dBuffer3));
3295:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3296:     PetscCallCUSPARSE(stat);
3297:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3298:   }
3299:   #else
3300:   size_t bufSize2;
3301:   /* query the number of bytes needed for the external work buffer */
3302:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3303:   PetscCallCUSPARSE(stat);
3304:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3305:   /* inspect the matrices A and B to understand the memory requirement for the next step */
3306:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3307:   PetscCallCUSPARSE(stat);
3308:   /* query the buffer size needed for the compute step */
3309:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3310:   PetscCallCUSPARSE(stat);
3311:   /* Neither the cuSPARSE documentation nor the API is clear here:
3312:      we need both buffers for the operations to work properly!
3313:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3314:      it only shows up in the workEstimation calls, yet it seems to be needed in compute, so probably its address
3315:      is stored in the descriptor! What a messy API... */
3316:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3317:   /* compute the intermediate product of A * B */
3318:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3319:   PetscCallCUSPARSE(stat);
3320:   /* get matrix C non-zero entries C_nnz1 */
3321:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3322:   c->nz = (PetscInt)C_nnz1;
3323:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3324:                       mmdata->mmBufferSize / 1024));
3325:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3326:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327:   Ccsr->values = new THRUSTARRAY(c->nz);
3328:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3329:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3330:   PetscCallCUSPARSE(stat);
3331:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3332:   PetscCallCUSPARSE(stat);
3333:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3334: #else
3335:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3336:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3337:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3338:   PetscCallCUSPARSE(stat);
3339:   c->nz                = cnz;
3340:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3341:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342:   Ccsr->values = new THRUSTARRAY(c->nz);
3343:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

3345:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3346:   /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization alone.
3347:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only pass by passing NULL for the values, but it seems quite buggy when
3348:      D is NULL, despite the fact that the cuSPARSE documentation claims it is supported! */
3349:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3350:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3351:   PetscCallCUSPARSE(stat);
3352: #endif
3353:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3354:   PetscCall(PetscLogGpuTimeEnd());
3355: finalizesym:
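  /* Finalization common to the empty and nonempty cases: copy the GPU row offsets and column indices of C back to
     the host (converting 32-bit to 64-bit indices if needed), expand compressed row offsets to full length, and fill
     in the usual Mat_SeqAIJ bookkeeping (ilen, imax, rmax, nonzerorowcnt, diagonal markers). */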
3356:   c->free_a = PETSC_TRUE;
3357:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3358:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3359:   c->free_ij = PETSC_TRUE;
3360:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3361:     PetscInt      *d_i = c->i;
3362:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3363:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3364:     ii = *Ccsr->row_offsets;
3365:     jj = *Ccsr->column_indices;
3366:     if (ciscompressed) d_i = c->compressedrow.i;
3367:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3369:   } else {
3370:     PetscInt *d_i = c->i;
3371:     if (ciscompressed) d_i = c->compressedrow.i;
3372:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3374:   }
3375:   if (ciscompressed) { /* need to expand host row offsets */
3376:     PetscInt r = 0;
3377:     c->i[0]    = 0;
3378:     for (k = 0; k < c->compressedrow.nrows; k++) {
3379:       const PetscInt next = c->compressedrow.rindex[k];
3380:       const PetscInt old  = c->compressedrow.i[k];
3381:       for (; r < next; r++) c->i[r + 1] = old;
3382:     }
3383:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3384:   }
3385:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3386:   PetscCall(PetscMalloc1(m, &c->ilen));
3387:   PetscCall(PetscMalloc1(m, &c->imax));
3388:   c->maxnz         = c->nz;
3389:   c->nonzerorowcnt = 0;
3390:   c->rmax          = 0;
3391:   for (k = 0; k < m; k++) {
3392:     const PetscInt nn = c->i[k + 1] - c->i[k];
3393:     c->ilen[k] = c->imax[k] = nn;
3394:     c->nonzerorowcnt += (PetscInt)!!nn;
3395:     c->rmax = PetscMax(c->rmax, nn);
3396:   }
3397:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3398:   PetscCall(PetscMalloc1(c->nz, &c->a));
3399:   Ccsr->num_entries = c->nz;

3401:   C->nonzerostate++;
3402:   PetscCall(PetscLayoutSetUp(C->rmap));
3403:   PetscCall(PetscLayoutSetUp(C->cmap));
3404:   Ccusp->nonzerostate = C->nonzerostate;
3405:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3406:   C->preallocated     = PETSC_TRUE;
3407:   C->assembled        = PETSC_FALSE;
3408:   C->was_assembled    = PETSC_FALSE;
3409:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3410:     mmdata->reusesym = PETSC_TRUE;
3411:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3412:   }
3413:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3414:   PetscFunctionReturn(PETSC_SUCCESS);
3415: }

3417: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3419: /* handles sparse or dense B; selects GPU or CPU product kernels based on the matrix types and command-line options */
3420: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3421: {
3422:   Mat_Product *product = mat->product;
3423:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3425:   PetscFunctionBegin;
3426:   MatCheckProduct(mat, 1);
3427:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3428:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3429:   if (product->type == MATPRODUCT_ABC) {
3430:     Ciscusp = PETSC_FALSE;
3431:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3432:   }
3433:   if (Biscusp && Ciscusp) { /* the user can always select the CPU backend via options */
3434:     PetscBool usecpu = PETSC_FALSE;
3435:     switch (product->type) {
3436:     case MATPRODUCT_AB:
3437:       if (product->api_user) {
3438:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3439:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3440:         PetscOptionsEnd();
3441:       } else {
3442:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3443:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3444:         PetscOptionsEnd();
3445:       }
3446:       break;
3447:     case MATPRODUCT_AtB:
3448:       if (product->api_user) {
3449:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3450:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3451:         PetscOptionsEnd();
3452:       } else {
3453:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3454:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3455:         PetscOptionsEnd();
3456:       }
3457:       break;
3458:     case MATPRODUCT_PtAP:
3459:       if (product->api_user) {
3460:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3461:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3462:         PetscOptionsEnd();
3463:       } else {
3464:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3465:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3466:         PetscOptionsEnd();
3467:       }
3468:       break;
3469:     case MATPRODUCT_RARt:
3470:       if (product->api_user) {
3471:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3472:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3473:         PetscOptionsEnd();
3474:       } else {
3475:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3476:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3477:         PetscOptionsEnd();
3478:       }
3479:       break;
3480:     case MATPRODUCT_ABC:
3481:       if (product->api_user) {
3482:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3483:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3484:         PetscOptionsEnd();
3485:       } else {
3486:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3487:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3488:         PetscOptionsEnd();
3489:       }
3490:       break;
3491:     default:
3492:       break;
3493:     }
3494:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3495:   }
3496:   /* dispatch */
3497:   if (isdense) {
3498:     switch (product->type) {
3499:     case MATPRODUCT_AB:
3500:     case MATPRODUCT_AtB:
3501:     case MATPRODUCT_ABt:
3502:     case MATPRODUCT_PtAP:
3503:     case MATPRODUCT_RARt:
3504:       if (product->A->boundtocpu) {
3505:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3506:       } else {
3507:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3508:       }
3509:       break;
3510:     case MATPRODUCT_ABC:
3511:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3512:       break;
3513:     default:
3514:       break;
3515:     }
3516:   } else if (Biscusp && Ciscusp) {
3517:     switch (product->type) {
3518:     case MATPRODUCT_AB:
3519:     case MATPRODUCT_AtB:
3520:     case MATPRODUCT_ABt:
3521:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3522:       break;
3523:     case MATPRODUCT_PtAP:
3524:     case MATPRODUCT_RARt:
3525:     case MATPRODUCT_ABC:
3526:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3527:       break;
3528:     default:
3529:       break;
3530:     }
3531:   } else { /* fallback for AIJ */
3532:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3533:   }
3534:   PetscFunctionReturn(PETSC_SUCCESS);
3535: }
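/* A typical user-level call sequence that reaches the routines above (a minimal sketch; A and B are assumed to be
   already assembled MATSEQAIJCUSPARSE matrices):

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/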

3537: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3538: {
3539:   PetscFunctionBegin;
3540:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3541:   PetscFunctionReturn(PETSC_SUCCESS);
3542: }

3544: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3545: {
3546:   PetscFunctionBegin;
3547:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3548:   PetscFunctionReturn(PETSC_SUCCESS);
3549: }

3551: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3552: {
3553:   PetscFunctionBegin;
3554:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3555:   PetscFunctionReturn(PETSC_SUCCESS);
3556: }

3558: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3559: {
3560:   PetscFunctionBegin;
3561:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3562:   PetscFunctionReturn(PETSC_SUCCESS);
3563: }

3565: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3566: {
3567:   PetscFunctionBegin;
3568:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3569:   PetscFunctionReturn(PETSC_SUCCESS);
3570: }
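/* Small helper kernel used when A is stored in compressed row format: it adds the entries of the short work vector x
   into the full output vector y at the row positions given by idx, one thread per entry. */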

3572: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3573: {
3574:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3575:   if (i < n) y[idx[i]] += x[i];
3576: }

3578: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3579: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3580: {
3581:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3582:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3583:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3584:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3585:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3586:   PetscBool                     compressed;
3587: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3588:   PetscInt nx, ny;
3589: #endif

3591:   PetscFunctionBegin;
3592:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3593:   if (!a->nz) {
3594:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3595:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3596:     PetscFunctionReturn(PETSC_SUCCESS);
3597:   }
3598:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc.) */
3599:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3600:   if (!trans) {
3601:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3602:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3603:   } else {
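    /* For transposed products we either let cuSPARSE apply op(A) = A^T (or A^H) directly, or, if the user asked for
       an explicit transpose (A->form_explicit_transpose) and the operation is not Hermitian, use the stored explicit
       transpose so that a plain non-transposed SpMV can be performed. */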
3604:     if (herm || !A->form_explicit_transpose) {
3605:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3606:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3607:     } else {
3608:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3609:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3610:     }
3611:   }
3612:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3613:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3615:   try {
3616:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3617:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3618:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3620:     PetscCall(PetscLogGpuTimeBegin());
3621:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3622:       /* z = A x + beta y.
3623:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3624:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3625:       */
3626:       xptr = xarray;
3627:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3628:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3629: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3630:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3631:           allocated to accommodate different uses. So we get the length info directly from mat.
3632:        */
3633:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3634:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3635:         nx             = mat->num_cols; // since y = Ax
3636:         ny             = mat->num_rows;
3637:       }
3638: #endif
3639:     } else {
3640:       /* z = A^T x + beta y
3641:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3642:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3643:        */
3644:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3645:       dptr = zarray;
3646:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3647:       if (compressed) { /* Scatter x to work vector */
3648:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

3650:         thrust::for_each(
3651: #if PetscDefined(HAVE_THRUST_ASYNC)
3652:           thrust::cuda::par.on(PetscDefaultCudaStream),
3653: #endif
3654:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3655:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3656:       }
3657: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3658:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3659:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3660:         nx             = mat->num_rows; // since y = A^T x
3661:         ny             = mat->num_cols;
3662:       }
3663: #endif
3664:     }

3666:     /* csr_spmv does y = alpha op(A) x + beta y */
3667:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3668: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3669:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3670:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but a cuSPARSE issue/bug (#212) appearing after 12.4 forced us to create a new one for each opA.
3671:   #else
3672:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3673:   #endif

3675:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3676:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3677:       if (!matDescr) {
3678:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3679:         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3680:       }
3681:   #endif

3683:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3684:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3685:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3686:         PetscCallCUSPARSE(
3687:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3688:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3689:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3690:         PetscCallCUSPARSE(
3691:           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3692:   #endif
3693:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3694:       } else {
3695:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3696:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3697:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3698:       }

3700:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3701: #else
3702:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3703:       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3704: #endif
3705:     } else {
3706:       if (cusparsestruct->nrows) {
3707: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3708:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3709: #else
3710:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3711:         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3712: #endif
3713:       }
3714:     }
3715:     PetscCall(PetscLogGpuTimeEnd());

3717:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3718:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3719:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3720:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3721:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3722:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3723:         }
3724:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3725:         PetscCall(VecSeq_CUDA::Set(zz, 0));
3726:       }

3728:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3729:       if (compressed) {
3730:         PetscCall(PetscLogGpuTimeBegin());
3731:         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3732:         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3733:         PetscCall(PetscLogGpuTimeEnd());
3734:       }
3735:     } else {
3736:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3737:     }
3738:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3739:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3740:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3741:   } catch (char *ex) {
3742:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3743:   }
3744:   if (yy) {
3745:     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3746:   } else {
3747:     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3748:   }
3749:   PetscFunctionReturn(PETSC_SUCCESS);
3750: }

3752: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3753: {
3754:   PetscFunctionBegin;
3755:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3756:   PetscFunctionReturn(PETSC_SUCCESS);
3757: }

3759: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3760: {
3761:   PetscFunctionBegin;
3762:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3763:   PetscFunctionReturn(PETSC_SUCCESS);
3764: }

3766: /*@
3767:   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3768:   (the default parallel PETSc format).

3770:   Collective

3772:   Input Parameters:
3773: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3774: . m    - number of rows
3775: . n    - number of columns
3776: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3777: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3779:   Output Parameter:
3780: . A - the matrix

3782:   Level: intermediate

3784:   Notes:
3785:   This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
3786:   calculations. For good matrix assembly performance the user should preallocate the matrix
3787:   storage by setting the parameter `nz` (or the array `nnz`).

3789:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3790:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3791:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3793:   The AIJ format, also called
3794:   compressed row storage, is fully compatible with standard Fortran
3795:   storage.  That is, the stored row and column indices can begin at
3796:   either one (as in Fortran) or zero.

3798:   Specify the preallocated storage with either nz or nnz (not both).
3799:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3800:   allocation.
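
  Example Usage (a minimal sketch; the sizes and per-row nonzero estimate below are illustrative):
.vb
  Mat A;

  PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 100, 100, 5, NULL, &A));
  /* ... insert values with MatSetValues(), then assemble ... */
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
.ve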

3802: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3803: @*/
3804: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3805: {
3806:   PetscFunctionBegin;
3807:   PetscCall(MatCreate(comm, A));
3808:   PetscCall(MatSetSizes(*A, m, n, m, n));
3809:   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3810:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3811:   PetscFunctionReturn(PETSC_SUCCESS);
3812: }

3814: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3815: {
3816:   PetscFunctionBegin;
3817:   if (A->factortype == MAT_FACTOR_NONE) {
3818:     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3819:   } else {
3820:     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3821:   }
3822:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3823:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3824:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3825:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3826:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3827:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3828:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3829:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3830:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3831:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3832:   PetscCall(MatDestroy_SeqAIJ(A));
3833:   PetscFunctionReturn(PETSC_SUCCESS);
3834: }

3836: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3837: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3838: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3839: {
3840:   PetscFunctionBegin;
3841:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3842:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3843:   PetscFunctionReturn(PETSC_SUCCESS);
3844: }

3846: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3847: {
3848:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3849:   Mat_SeqAIJCUSPARSE *cy;
3850:   Mat_SeqAIJCUSPARSE *cx;
3851:   PetscScalar        *ay;
3852:   const PetscScalar  *ax;
3853:   CsrMatrix          *csry, *csrx;

3855:   PetscFunctionBegin;
3856:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3857:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3858:   if (X->ops->axpy != Y->ops->axpy) {
3859:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3860:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3861:     PetscFunctionReturn(PETSC_SUCCESS);
3862:   }
3863:   /* if we are here, both matrices use the CUSPARSE implementation, i.e. neither is bound to the CPU */
3864:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3865:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3866:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3867:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3868:   csry = (CsrMatrix *)cy->mat->mat;
3869:   csrx = (CsrMatrix *)cx->mat->mat;
3870:   /* see if we can turn this into a cublas axpy */
3871:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3872:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3873:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3874:     if (eq) str = SAME_NONZERO_PATTERN;
3875:   }
3876:   /* cusparse spgeam is buggy when the matrices have a single column; fall back to the CPU path below */
3877:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3879:   if (str == SUBSET_NONZERO_PATTERN) {
3880:     PetscScalar b = 1.0;
3881: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3882:     size_t bufferSize;
3883:     void  *buffer;
3884: #endif

3886:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3887:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3888:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3889: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3890:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3891:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3892:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3893:     PetscCall(PetscLogGpuTimeBegin());
3894:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3895:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3896:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3897:     PetscCall(PetscLogGpuTimeEnd());
3898:     PetscCallCUDA(cudaFree(buffer));
3899: #else
3900:     PetscCall(PetscLogGpuTimeBegin());
3901:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3902:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3903:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3904:     PetscCall(PetscLogGpuTimeEnd());
3905: #endif
3906:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3907:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3908:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3909:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3910:   } else if (str == SAME_NONZERO_PATTERN) {
3911:     cublasHandle_t cublasv2handle;
3912:     PetscBLASInt   one = 1, bnz = 1;

3914:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3915:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3916:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3917:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3918:     PetscCall(PetscLogGpuTimeBegin());
3919:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3920:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3921:     PetscCall(PetscLogGpuTimeEnd());
3922:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3923:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3924:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3925:   } else {
3926:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3927:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3928:   }
3929:   PetscFunctionReturn(PETSC_SUCCESS);
3930: }

3932: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3933: {
3934:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3935:   PetscScalar   *ay;
3936:   cublasHandle_t cublasv2handle;
3937:   PetscBLASInt   one = 1, bnz = 1;

3939:   PetscFunctionBegin;
3940:   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3941:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3942:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3943:   PetscCall(PetscLogGpuTimeBegin());
3944:   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3945:   PetscCall(PetscLogGpuFlops(bnz));
3946:   PetscCall(PetscLogGpuTimeEnd());
3947:   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3948:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3949:   PetscFunctionReturn(PETSC_SUCCESS);
3950: }

3952: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3953: {
3954:   PetscBool   both = PETSC_FALSE;
3955:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3957:   PetscFunctionBegin;
3958:   if (A->factortype == MAT_FACTOR_NONE) {
3959:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3960:     if (spptr->mat) {
3961:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3962:       if (matrix->values) {
3963:         both = PETSC_TRUE;
3964:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3965:       }
3966:     }
3967:     if (spptr->matTranspose) {
3968:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3969:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3970:     }
3971:   }
3972:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3973:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3974:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3975:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3976:   PetscFunctionReturn(PETSC_SUCCESS);
3977: }

3979: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3980: {
3981:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3983:   PetscFunctionBegin;
3984:   if (A->factortype != MAT_FACTOR_NONE) {
3985:     A->boundtocpu = flg;
3986:     PetscFunctionReturn(PETSC_SUCCESS);
3987:   }
3988:   if (flg) {
3989:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

3991:     A->ops->scale                     = MatScale_SeqAIJ;
3992:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3993:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3994:     A->ops->mult                      = MatMult_SeqAIJ;
3995:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3996:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3997:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3998:     A->ops->multhermitiantranspose    = NULL;
3999:     A->ops->multhermitiantransposeadd = NULL;
4000:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4001:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4002:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4003:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4004:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4005:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4006:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4007:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4008:   } else {
4009:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4010:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4011:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4012:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4013:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4014:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4015:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4016:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4017:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4018:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4019:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4020:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4021:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4022:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4023:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4024:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4025:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

4027:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4028:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4029:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4030:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4031:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4032:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4033:   }
4034:   A->boundtocpu = flg;
4035:   if (flg && a->inode.size_csr) {
4036:     a->inode.use = PETSC_TRUE;
4037:   } else {
4038:     a->inode.use = PETSC_FALSE;
4039:   }
4040:   PetscFunctionReturn(PETSC_SUCCESS);
4041: }

4043: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4044: {
4045:   Mat B;

4047:   PetscFunctionBegin;
4048:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4049:   if (reuse == MAT_INITIAL_MATRIX) {
4050:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4051:   } else if (reuse == MAT_REUSE_MATRIX) {
4052:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4053:   }
4054:   B = *newmat;

4056:   PetscCall(PetscFree(B->defaultvectype));
4057:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

4059:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4060:     if (B->factortype == MAT_FACTOR_NONE) {
4061:       Mat_SeqAIJCUSPARSE *spptr;
4062:       PetscCall(PetscNew(&spptr));
4063:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4064:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4065:       spptr->format = MAT_CUSPARSE_CSR;
4066: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4067:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4068:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4069:   #else
4070:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4071:   #endif
4072:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4073:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4074: #endif
4075:       B->spptr = spptr;
4076:     } else {
4077:       Mat_SeqAIJCUSPARSETriFactors *spptr;

4079:       PetscCall(PetscNew(&spptr));
4080:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4081:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4082:       B->spptr = spptr;
4083:     }
4084:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4085:   }
4086:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4087:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4088:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4089:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4090:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4091:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

4093:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4094:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4095:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4096: #if defined(PETSC_HAVE_HYPRE)
4097:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4098: #endif
4099:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4100:   PetscFunctionReturn(PETSC_SUCCESS);
4101: }

4103: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4104: {
4105:   PetscFunctionBegin;
4106:   PetscCall(MatCreate_SeqAIJ(B));
4107:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4108:   PetscFunctionReturn(PETSC_SUCCESS);
4109: }

4111: /*MC
4112:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

4114:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4115:    CSR, ELL, or Hybrid format (ELL and HYB are only available with CUDA versions before 11.0).
4116:    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

4118:    Options Database Keys:
4119: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4120: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4121:                                       Other options include ell (ellpack) or hyb (hybrid).
4122: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4123: -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
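
   Example Usage:
   A sketch of obtaining this type at runtime via the options database (A is a `Mat` and n a
   `PetscInt` declared by the caller; run the program with -mat_type aijcusparse):
.vb
   PetscCall(MatCreate(PETSC_COMM_SELF, &A));
   PetscCall(MatSetSizes(A, n, n, n, n));
   PetscCall(MatSetFromOptions(A));
.ve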

4125:   Level: beginner

4127: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4128: M*/

4130: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4131: {
4132:   PetscFunctionBegin;
4133:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4134:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4135:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4136:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4137:   PetscFunctionReturn(PETSC_SUCCESS);
4138: }
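
/* A usage sketch for selecting the CUSPARSE factorizations registered above from user code,
   where pc is an existing PC object (equivalently, pass -pc_factor_mat_solver_type cusparse
   on the command line):

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
*/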

4140: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4141: {
4142:   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

4144:   PetscFunctionBegin;
4145:   if (cusp) {
4146:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4147:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4148:     delete cusp->workVector;
4149:     delete cusp->rowoffsets_gpu;
4150:     delete cusp->csr2csc_i;
4151:     delete cusp->coords;
4152:     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4153:     PetscCall(PetscFree(mat->spptr));
4154:   }
4155:   PetscFunctionReturn(PETSC_SUCCESS);
4156: }

4158: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4159: {
4160:   PetscFunctionBegin;
4161:   if (*mat) {
4162:     delete (*mat)->values;
4163:     delete (*mat)->column_indices;
4164:     delete (*mat)->row_offsets;
4165:     delete *mat;
4166:     *mat = 0;
4167:   }
4168:   PetscFunctionReturn(PETSC_SUCCESS);
4169: }

4171: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4172: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4173: {
4174:   PetscFunctionBegin;
4175:   if (*trifactor) {
4176:     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4177:     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4178:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4179:     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4180:     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4181:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4182:     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4183:   #endif
4184:     PetscCall(PetscFree(*trifactor));
4185:   }
4186:   PetscFunctionReturn(PETSC_SUCCESS);
4187: }
4188: #endif

4190: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4191: {
4192:   CsrMatrix *mat;

4194:   PetscFunctionBegin;
4195:   if (*matstruct) {
4196:     if ((*matstruct)->mat) {
4197:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4198: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4199:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4200: #else
4201:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4202:         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4203: #endif
4204:       } else {
4205:         mat = (CsrMatrix *)(*matstruct)->mat;
4206:         PetscCall(CsrMatrix_Destroy(&mat));
4207:       }
4208:     }
4209:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4210:     delete (*matstruct)->cprowIndices;
4211:     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4212:     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4213:     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

4215: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4216:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4217:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

4219:     for (int i = 0; i < 3; i++) {
4220:       if (mdata->cuSpMV[i].initialized) {
4221:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4222:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4223:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4224:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4225:         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4226:         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4227:   #endif
4228:       }
4229:     }
4230: #endif
4231:     delete *matstruct;
4232:     *matstruct = NULL;
4233:   }
4234:   PetscFunctionReturn(PETSC_SUCCESS);
4235: }

4237: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4238: {
4239:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

4241:   PetscFunctionBegin;
4242:   if (fs) {
4243: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4244:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4245:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4246:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4247:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4248:     delete fs->workVector;
4249:     fs->workVector = NULL;
4250: #endif
4251:     delete fs->rpermIndices;
4252:     delete fs->cpermIndices;
4253:     fs->rpermIndices  = NULL;
4254:     fs->cpermIndices  = NULL;
4255:     fs->init_dev_prop = PETSC_FALSE;
4256: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4257:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4258:     PetscCallCUDA(cudaFree(fs->csrColIdx));
4259:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4260:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4261:     PetscCallCUDA(cudaFree(fs->csrVal));
4262:     PetscCallCUDA(cudaFree(fs->diag));
4263:     PetscCallCUDA(cudaFree(fs->X));
4264:     PetscCallCUDA(cudaFree(fs->Y));
4265:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
4266:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4267:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4268:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4269:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4270:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4271:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4272:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4273:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4274:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4275:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4276:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4277:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4278:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4279:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4280:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4281:     PetscCall(PetscFree(fs->csrRowPtr_h));
4282:     PetscCall(PetscFree(fs->csrVal_h));
4283:     PetscCall(PetscFree(fs->diag_h));
4284:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4285:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4286: #endif
4287:   }
4288:   PetscFunctionReturn(PETSC_SUCCESS);
4289: }

4291: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4292: {
4293:   PetscFunctionBegin;
4294:   if (*trifactors) {
4295:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4296:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4297:     PetscCall(PetscFree(*trifactors));
4298:   }
4299:   PetscFunctionReturn(PETSC_SUCCESS);
4300: }

4302: struct IJCompare {
4303:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4304:   {
4305:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4306:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4307:     return false;
4308:   }
4309: };

4311: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4312: {
4313:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4315:   PetscFunctionBegin;
4316:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4317:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4318:   if (destroy) {
4319:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4320:     delete cusp->csr2csc_i;
4321:     cusp->csr2csc_i = NULL;
4322:   }
4323:   A->transupdated = PETSC_FALSE;
4324:   PetscFunctionReturn(PETSC_SUCCESS);
4325: }

4327: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4328: {
4329:   MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

4331:   PetscFunctionBegin;
4332:   PetscCallCUDA(cudaFree(coo->perm));
4333:   PetscCallCUDA(cudaFree(coo->jmap));
4334:   PetscCall(PetscFree(coo));
4335:   PetscFunctionReturn(PETSC_SUCCESS);
4336: }

4338: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4339: {
4340:   PetscBool            dev_ij = PETSC_FALSE;
4341:   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
4342:   PetscInt            *i, *j;
4343:   PetscContainer       container_h;
4344:   MatCOOStruct_SeqAIJ *coo_h, *coo_d;

4346:   PetscFunctionBegin;
4347:   PetscCall(PetscGetMemType(coo_i, &mtype));
4348:   if (PetscMemTypeDevice(mtype)) {
4349:     dev_ij = PETSC_TRUE;
4350:     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4351:     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4352:     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4353:   } else {
4354:     i = coo_i;
4355:     j = coo_j;
4356:   }

4358:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4359:   if (dev_ij) PetscCall(PetscFree2(i, j));
4360:   mat->offloadmask = PETSC_OFFLOAD_CPU;
4361:   // Create the GPU memory
4362:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

4364:   // Copy the COO struct to device
4365:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4366:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4367:   PetscCall(PetscMalloc1(1, &coo_d));
4368:   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4369:   PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4370:   PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4371:   PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4372:   PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

4374:   // Put the COO struct in a container and then attach that to the matrix
4375:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4376:   PetscFunctionReturn(PETSC_SUCCESS);
4377: }
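
/* A usage sketch of the COO assembly path implemented above, using the public interface
   (coo_i/coo_j and v may reside either on the host or on the device):

     PetscCall(MatSetPreallocationCOO(A, ncoo, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));
*/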

// Grid-stride kernel used by MatSetValuesCOO_SeqAIJCUSPARSE: for stored nonzero i, it sums the
// user-provided COO values kv[perm[k]] for k in [jmap[i], jmap[i+1]) and inserts or adds the result into a[i]
4379: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4380: {
4381:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4382:   const PetscCount grid_size = gridDim.x * blockDim.x;
4383:   for (; i < nnz; i += grid_size) {
4384:     PetscScalar sum = 0.0;
4385:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4386:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4387:   }
4388: }

4390: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4391: {
4392:   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
4393:   Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4394:   PetscCount           Annz = seq->nz;
4395:   PetscMemType         memtype;
4396:   const PetscScalar   *v1 = v;
4397:   PetscScalar         *Aa;
4398:   PetscContainer       container;
4399:   MatCOOStruct_SeqAIJ *coo;

4401:   PetscFunctionBegin;
4402:   if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

4404:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4405:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

4407:   PetscCall(PetscGetMemType(v, &memtype));
4408:   if (PetscMemTypeHost(memtype)) { /* if the user provided v[] on the host, copy it to the device */
4409:     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4410:     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4411:   }

4413:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4414:   else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

4416:   PetscCall(PetscLogGpuTimeBegin());
4417:   if (Annz) {
4418:     MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4419:     PetscCallCUDA(cudaPeekAtLastError());
4420:   }
4421:   PetscCall(PetscLogGpuTimeEnd());

4423:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4424:   else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

4426:   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4427:   PetscFunctionReturn(PETSC_SUCCESS);
4428: }

4430: /*@C
4431:   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

4433:   Not Collective

4435:   Input Parameters:
4436: + A          - the matrix
4437: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

4439:   Output Parameters:
4440: + i - the CSR row pointers
4441: - j - the CSR column indices

4443:   Level: developer

4445:   Note:
4446:   When `compressed` is `PETSC_TRUE`, the returned CSR structure does not contain empty rows
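
  Example Usage:
  A minimal sketch pairing the accessor with its restore call, for an assembled `MATSEQAIJCUSPARSE`
  matrix A; what is done with the device pointers in between is up to the caller (for example a
  user CUDA kernel or a cuSPARSE call):
.vb
  const int *i, *j;

  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_TRUE, &i, &j));
  /* i and j are device pointers to the CSR row offsets and column indices of A */
  PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_TRUE, &i, &j));
.ve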

4448: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4449: @*/
4450: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4451: {
4452:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4453:   CsrMatrix          *csr;
4454:   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

4456:   PetscFunctionBegin;
4458:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4459:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4460:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4461:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4462:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4463:   csr = (CsrMatrix *)cusp->mat->mat;
4464:   if (i) {
4465:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4466:       if (!cusp->rowoffsets_gpu) {
4467:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4468:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4469:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4470:       }
4471:       *i = cusp->rowoffsets_gpu->data().get();
4472:     } else *i = csr->row_offsets->data().get();
4473:   }
4474:   if (j) *j = csr->column_indices->data().get();
4475:   PetscFunctionReturn(PETSC_SUCCESS);
4476: }

4478: /*@C
4479:   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

4481:   Not Collective

4483:   Input Parameters:
4484: + A          - the matrix
4485: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4486: . i          - the CSR row pointers
4487: - j          - the CSR column indices

4489:   Level: developer

4491: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4492: @*/
4493: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4494: {
4495:   PetscFunctionBegin;
4497:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4498:   if (i) *i = NULL;
4499:   if (j) *j = NULL;
4500:   (void)compressed;
4501:   PetscFunctionReturn(PETSC_SUCCESS);
4502: }

4504: /*@C
4505:   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4507:   Not Collective

4509:   Input Parameter:
4510: . A - a `MATSEQAIJCUSPARSE` matrix

4512:   Output Parameter:
4513: . a - pointer to the device data

4515:   Level: developer

4517:   Note:
4518:   May trigger host-to-device copies if the up-to-date matrix data is on the host
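
  Example Usage:
  A minimal sketch (the use made of the device pointer between the two calls is up to the caller):
.vb
  const PetscScalar *a;

  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
  /* a points to the CSR values of A on the device; read-only access */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
.ve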

4520: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4521: @*/
4522: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4523: {
4524:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4525:   CsrMatrix          *csr;

4527:   PetscFunctionBegin;
4529:   PetscAssertPointer(a, 2);
4530:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4531:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4532:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4533:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4534:   csr = (CsrMatrix *)cusp->mat->mat;
4535:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4536:   *a = csr->values->data().get();
4537:   PetscFunctionReturn(PETSC_SUCCESS);
4538: }

4540: /*@C
4541:   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

4543:   Not Collective

4545:   Input Parameters:
4546: + A - a `MATSEQAIJCUSPARSE` matrix
4547: - a - pointer to the device data

4549:   Level: developer

4551: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4552: @*/
4553: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4554: {
4555:   PetscFunctionBegin;
4557:   PetscAssertPointer(a, 2);
4558:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4559:   *a = NULL;
4560:   PetscFunctionReturn(PETSC_SUCCESS);
4561: }

4563: /*@C
4564:   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4566:   Not Collective

4568:   Input Parameter:
4569: . A - a `MATSEQAIJCUSPARSE` matrix

4571:   Output Parameter:
4572: . a - pointer to the device data

4574:   Level: developer

4576:   Note:
4577:   May trigger host-to-device copies if the up-to-date matrix data is on the host
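
  Example Usage:
  A minimal sketch for read-write access to the device values (the in-place modification itself is
  left as a placeholder comment):
.vb
  PetscScalar *a;

  PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a));
  /* modify the nonzero values of A in a[] with user device code here */
  PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a));
.ve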

4579: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4580: @*/
4581: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4582: {
4583:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4584:   CsrMatrix          *csr;

4586:   PetscFunctionBegin;
4588:   PetscAssertPointer(a, 2);
4589:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4590:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4591:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4592:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4593:   csr = (CsrMatrix *)cusp->mat->mat;
4594:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4595:   *a             = csr->values->data().get();
4596:   A->offloadmask = PETSC_OFFLOAD_GPU;
4597:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4598:   PetscFunctionReturn(PETSC_SUCCESS);
4599: }

4600: /*@C
4601:   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

4603:   Not Collective

4605:   Input Parameters:
4606: + A - a `MATSEQAIJCUSPARSE` matrix
4607: - a - pointer to the device data

4609:   Level: developer

4611: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4612: @*/
4613: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4614: {
4615:   PetscFunctionBegin;
4617:   PetscAssertPointer(a, 2);
4618:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4619:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4620:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4621:   *a = NULL;
4622:   PetscFunctionReturn(PETSC_SUCCESS);
4623: }

4625: /*@C
4626:   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4628:   Not Collective

4630:   Input Parameter:
4631: . A - a `MATSEQAIJCUSPARSE` matrix

4633:   Output Parameter:
4634: . a - pointer to the device data

4636:   Level: developer

4638:   Note:
4639:   Does not trigger host-to-device copies; the GPU copy of the data is marked as the up-to-date one

4641: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4642: @*/
4643: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4644: {
4645:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4646:   CsrMatrix          *csr;

4648:   PetscFunctionBegin;
4650:   PetscAssertPointer(a, 2);
4651:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4652:   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4653:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4654:   csr = (CsrMatrix *)cusp->mat->mat;
4655:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4656:   *a             = csr->values->data().get();
4657:   A->offloadmask = PETSC_OFFLOAD_GPU;
4658:   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4659:   PetscFunctionReturn(PETSC_SUCCESS);
4660: }

4662: /*@C
4663:   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

4665:   Not Collective

4667:   Input Parameters:
4668: + A - a `MATSEQAIJCUSPARSE` matrix
4669: - a - pointer to the device data

4671:   Level: developer

4673: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4674: @*/
4675: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4676: {
4677:   PetscFunctionBegin;
4679:   PetscAssertPointer(a, 2);
4680:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4681:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4682:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4683:   *a = NULL;
4684:   PetscFunctionReturn(PETSC_SUCCESS);
4685: }

4687: struct IJCompare4 {
4688:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4689:   {
4690:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4691:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4692:     return false;
4693:   }
4694: };

4696: struct Shift {
4697:   int _shift;

4699:   Shift(int shift) : _shift(shift) { }
4700:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4701: };

4703: /* merges two SeqAIJCUSPARSE matrices A, B row by row, producing C = [A, B] (the [A';B']' operation in MATLAB notation) */
4704: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4705: {
4706:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4707:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4708:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4709:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4710:   PetscInt                      Annz, Bnnz;
4711:   cusparseStatus_t              stat;
4712:   PetscInt                      i, m, n, zero = 0;

4714:   PetscFunctionBegin;
4717:   PetscAssertPointer(C, 4);
4718:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4719:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4720:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4721:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4722:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4723:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4724:   if (reuse == MAT_INITIAL_MATRIX) {
4725:     m = A->rmap->n;
4726:     n = A->cmap->n + B->cmap->n;
4727:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4728:     PetscCall(MatSetSizes(*C, m, n, m, n));
4729:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4730:     c                       = (Mat_SeqAIJ *)(*C)->data;
4731:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4732:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4733:     Ccsr                    = new CsrMatrix;
4734:     Cmat->cprowIndices      = NULL;
4735:     c->compressedrow.use    = PETSC_FALSE;
4736:     c->compressedrow.nrows  = 0;
4737:     c->compressedrow.i      = NULL;
4738:     c->compressedrow.rindex = NULL;
4739:     Ccusp->workVector       = NULL;
4740:     Ccusp->nrows            = m;
4741:     Ccusp->mat              = Cmat;
4742:     Ccusp->mat->mat         = Ccsr;
4743:     Ccsr->num_rows          = m;
4744:     Ccsr->num_cols          = n;
4745:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4746:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4747:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4748:     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4749:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4750:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4751:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4752:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4753:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4754:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4755:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4756:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4757:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

4759:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4760:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4761:     Annz                 = (PetscInt)Acsr->column_indices->size();
4762:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4763:     c->nz                = Annz + Bnnz;
4764:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4765:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4766:     Ccsr->values         = new THRUSTARRAY(c->nz);
4767:     Ccsr->num_entries    = c->nz;
4768:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4769:     if (c->nz) {
4770:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4771:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4772:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4773:       THRUSTINTARRAY32 *Aroff, *Broff;

4775:       if (a->compressedrow.use) { /* need full row offset */
4776:         if (!Acusp->rowoffsets_gpu) {
4777:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4778:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4779:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4780:         }
4781:         Aroff = Acusp->rowoffsets_gpu;
4782:       } else Aroff = Acsr->row_offsets;
4783:       if (b->compressedrow.use) { /* need full row offset */
4784:         if (!Bcusp->rowoffsets_gpu) {
4785:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4786:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4787:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4788:         }
4789:         Broff = Bcusp->rowoffsets_gpu;
4790:       } else Broff = Bcsr->row_offsets;
4791:       PetscCall(PetscLogGpuTimeBegin());
4792:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4793:       PetscCallCUSPARSE(stat);
4794:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4795:       PetscCallCUSPARSE(stat);
4796:       /* use integer (1/0) markers instead of bool: bool caused issues with large matrices on SUMMIT with CUDA 10.2.89 */
4797:       auto Aperm = thrust::make_constant_iterator(1);
4798:       auto Bperm = thrust::make_constant_iterator(0);
4799: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4800:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4801:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4802: #else
4803:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4804:       auto Bcib = Bcsr->column_indices->begin();
4805:       auto Bcie = Bcsr->column_indices->end();
4806:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4807: #endif
4808:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4809:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4810:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4811:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4812:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4813:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4814:       auto p1    = Ccusp->coords->begin();
4815:       auto p2    = Ccusp->coords->begin();
4816:       thrust::advance(p2, Annz);
4817:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4818: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4819:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4820: #endif
4821:       auto cci = thrust::make_counting_iterator(zero);
4822:       auto cce = thrust::make_counting_iterator(c->nz);
4823: #if 0 // thrust::partition_copy errors on SUMMIT with CUDA 11.1.0
4824:       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4825: #else
4826:   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4827:       auto pred = thrust::identity<int>();
4828:   #else
4829:       auto pred = cuda::std::identity();
4830:   #endif
4831:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4832:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4833: #endif
4834:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4835:       PetscCallCUSPARSE(stat);
4836:       PetscCall(PetscLogGpuTimeEnd());
4837:       delete wPerm;
4838:       delete Acoo;
4839:       delete Bcoo;
4840:       delete Ccoo;
4841: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4842:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4843:       PetscCallCUSPARSE(stat);
4844: #endif
4845:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4846:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4847:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4848:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4849:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4850:         CsrMatrix                    *CcsrT = new CsrMatrix;
4851:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4852:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4854:         (*C)->form_explicit_transpose = PETSC_TRUE;
4855:         (*C)->transupdated            = PETSC_TRUE;
4856:         Ccusp->rowoffsets_gpu         = NULL;
4857:         CmatT->cprowIndices           = NULL;
4858:         CmatT->mat                    = CcsrT;
4859:         CcsrT->num_rows               = n;
4860:         CcsrT->num_cols               = m;
4861:         CcsrT->num_entries            = c->nz;

4863:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4864:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4865:         CcsrT->values         = new THRUSTARRAY(c->nz);

4867:         PetscCall(PetscLogGpuTimeBegin());
4868:         auto rT = CcsrT->row_offsets->begin();
4869:         if (AT) {
4870:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4871:           thrust::advance(rT, -1);
4872:         }
4873:         if (BT) {
4874:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4875:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4876:           thrust::copy(titb, tite, rT);
4877:         }
4878:         auto cT = CcsrT->column_indices->begin();
4879:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4880:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4881:         auto vT = CcsrT->values->begin();
4882:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4883:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4884:         PetscCall(PetscLogGpuTimeEnd());

4886:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4887:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4888:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4889:         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4890:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4891:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4892:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4893:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4894:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4895: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4896:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4897:         PetscCallCUSPARSE(stat);
4898: #endif
4899:         Ccusp->matTranspose = CmatT;
4900:       }
4901:     }

4903:     c->free_a = PETSC_TRUE;
4904:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4905:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4906:     c->free_ij = PETSC_TRUE;
4907:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4908:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4909:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4910:       ii = *Ccsr->row_offsets;
4911:       jj = *Ccsr->column_indices;
4912:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4913:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4914:     } else {
4915:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4916:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4917:     }
4918:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4919:     PetscCall(PetscMalloc1(m, &c->ilen));
4920:     PetscCall(PetscMalloc1(m, &c->imax));
4921:     c->maxnz         = c->nz;
4922:     c->nonzerorowcnt = 0;
4923:     c->rmax          = 0;
4924:     for (i = 0; i < m; i++) {
4925:       const PetscInt nn = c->i[i + 1] - c->i[i];
4926:       c->ilen[i] = c->imax[i] = nn;
4927:       c->nonzerorowcnt += (PetscInt)!!nn;
4928:       c->rmax = PetscMax(c->rmax, nn);
4929:     }
4930:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4931:     PetscCall(PetscMalloc1(c->nz, &c->a));
4932:     (*C)->nonzerostate++;
4933:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4934:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4935:     Ccusp->nonzerostate = (*C)->nonzerostate;
4936:     (*C)->preallocated  = PETSC_TRUE;
4937:   } else {
4938:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4939:     c = (Mat_SeqAIJ *)(*C)->data;
4940:     if (c->nz) {
4941:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4942:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4943:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4944:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4945:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4946:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4947:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4948:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4949:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4950:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4951:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4952:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4953:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4954:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4955:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4956:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4957:       auto pmid = Ccusp->coords->begin();
4958:       thrust::advance(pmid, Acsr->num_entries);
4959:       PetscCall(PetscLogGpuTimeBegin());
4960:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4961:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4962:       thrust::for_each(zibait, zieait, VecCUDAEquals());
4963:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4964:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4965:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4966:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4967:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4968:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4969:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4970:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4971:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4972:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4973:         auto       vT    = CcsrT->values->begin();
4974:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4975:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4976:         (*C)->transupdated = PETSC_TRUE;
4977:       }
4978:       PetscCall(PetscLogGpuTimeEnd());
4979:     }
4980:   }
4981:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4982:   (*C)->assembled     = PETSC_TRUE;
4983:   (*C)->was_assembled = PETSC_FALSE;
4984:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4985:   PetscFunctionReturn(PETSC_SUCCESS);
4986: }

4988: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4989: {
4990:   bool               dmem;
4991:   const PetscScalar *av;

4993:   PetscFunctionBegin;
4994:   dmem = isCudaMem(v);
4995:   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4996:   if (n && idx) {
4997:     THRUSTINTARRAY widx(n);
4998:     widx.assign(idx, idx + n);
4999:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

5001:     THRUSTARRAY                    *w = NULL;
5002:     thrust::device_ptr<PetscScalar> dv;
5003:     if (dmem) {
5004:       dv = thrust::device_pointer_cast(v);
5005:     } else {
5006:       w  = new THRUSTARRAY(n);
5007:       dv = w->data();
5008:     }
5009:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

5011:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5012:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5013:     thrust::for_each(zibit, zieit, VecCUDAEquals());
5014:     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5015:     delete w;
5016:   } else {
5017:     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5018:   }
5019:   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
5020:   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5021:   PetscFunctionReturn(PETSC_SUCCESS);
5022: }
5023: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()