Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18: #endif
 19: #include <thrust/iterator/constant_iterator.h>
 20: #include <thrust/remove.h>
 21: #include <thrust/sort.h>
 22: #include <thrust/unique.h>
 23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
 24:   #include <cuda/std/functional>
 25: #endif

 27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 29: /*
 30:   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 31:   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
 32: */
 33: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 34: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 35: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 36: #endif

 38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 42: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
 43: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 44: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 45: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 46: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 47: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 48: #endif
 49: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 50: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 51: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 52: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
 53: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 54: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 55: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 56: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 57: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 58: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 59: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 61: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 62: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 63: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 64: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 66: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 67: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 69: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 70: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 71: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 73: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 74: {
 75:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

 77:   PetscFunctionBegin;
 78:   switch (op) {
 79:   case MAT_CUSPARSE_MULT:
 80:     cusparsestruct->format = format;
 81:     break;
 82:   case MAT_CUSPARSE_ALL:
 83:     cusparsestruct->format = format;
 84:     break;
 85:   default:
 86:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
 87:   }
 88:   PetscFunctionReturn(PETSC_SUCCESS);
 89: }

 91: /*@
 92:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
 93:   operation. Only the `MatMult()` operation can use different GPU storage formats

 95:   Not Collective

 97:   Input Parameters:
 98: + A      - Matrix of type `MATSEQAIJCUSPARSE`
 99: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
100:            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
101: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

103:   Level: intermediate

105: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
106: @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation if A provides one; PetscTryMethod()
     is a no-op for matrix types that have not composed "MatCUSPARSESetFormat_C",
     so this is safe to call on any Mat */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the flag
  that selects the built-in CPU MatSolve() instead of the GPU triangular solve.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

124: /*@
125:   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

127:   Input Parameters:
128: + A       - Matrix of type `MATSEQAIJCUSPARSE`
129: - use_cpu - set flag for using the built-in CPU `MatSolve()`

131:   Level: intermediate

133:   Note:
134:   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
135:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
136:   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

138: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
139: @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation if A provides one; PetscTryMethod()
     is a no-op for matrix types that have not composed "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetOption() implementation for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE
  (which requires invalidating any cached GPU transpose) and forwards every other option
  to the host SeqAIJ implementation.
*/
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Processes the -mat_cusparse_* command line options for a SEQAIJCUSPARSE matrix.
  Options are only consulted for unfactored matrices (factortype == MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Storage format for MatMult() only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* Storage format for all operations; processed second, so when both options are given this one wins */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    /* Same positional-consistency check for the SpMM algorithm enum */
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* Same positional-consistency check for the CSR-to-CSC conversion algorithm enum */
    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

200: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Builds (or updates) a single device CSR matrix M = (L - I) + U from the host factored
  matrix A, then creates/updates the cuSPARSE SpSV descriptors used for the two triangular
  solves. PETSc's SeqAIJ factored storage keeps L without its unit diagonal and stores the
  U diagonal inverted, so each row is re-arranged into regular CSR layout on the host
  before upload. On repeat calls only the numerical values are refreshed; the structure
  (row pointers, column indices, descriptors, buffers) is reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      // NOTE: adiag[] indexes U rows in reverse, so adiag[0] - adiag[m] counts U's nonzeros
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = adiag[i] - adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same CSR arrays; the fill-mode/diag-type attributes tell cuSPARSE which triangle to use
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep host row pointers and a host value buffer; Mj is no longer needed (column indices live on the device)
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry (stored inverted in the SeqAIJ factors)
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
306: #else
/*
  Builds (or updates) the unit-lower-triangular ILU factor L on the GPU for the legacy
  (CUDA < 11.4) csrsv code path. The SeqAIJ factored storage omits L's unit diagonal, so
  the 1's are inserted explicitly here. On the first call the CSR structure, cuSPARSE
  matrix descriptor, and solve-analysis information are created; later calls refresh only
  the numerical values through the retained pinned host buffer AA_h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 holds only the diagonal 1, hence ai[n] - ai[1] plus n diagonal entries */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host memory so the thrust assign() upload below can use fast transfers */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h is retained for future value-only updates, while the
           index arrays are freed since they now live on the device */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust normally throws std::exception-derived types (e.g. thrust::system_error);
         a char* catch may not intercept those — confirm this matches the project's exception convention */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Builds (or updates) the upper-triangular ILU factor U on the GPU for the legacy
  (CUDA < 11.4) csrsv code path. The SeqAIJ factored storage keeps U's rows in reverse
  order via the adiag[] markers and stores the diagonal inverted, so rows are unpacked
  back-to-front and the diagonal is re-inverted (1/v[nz]) while filling. On the first call
  the CSR structure, descriptor, and solve-analysis information are created; later calls
  refresh only the numerical values through the retained pinned host buffer AA_h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj                 = a->j, *adiag, *vi;
  const MatScalar                   *aa                 = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign() upload below can use fast transfers */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first since
           adiag[] indexes the skewed U storage in reverse */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* diagonal is stored inverted in the factors */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h is retained for future value-only updates, while the
           index arrays are freed since they now live on the device */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
569: #endif

// Push the host ILU factors of A to the GPU (building the device-side triangular
// factor data structures on first use) and cache the row/column permutations on
// the device unless they are the identity.
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, icolIS = aij->icol;
  PetscBool                     rowIsIdentity, colIsIdentity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif

  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* cache the row permutation on the device, but only when it is not the identity */
  PetscCall(ISIdentity(rowIS, &rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowIS, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx, ridx + nrows);
    PetscCall(ISRestoreIndices(rowIS, &ridx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* likewise for the inverse column permutation */
  PetscCall(ISIdentity(icolIS, &colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(icolIS, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx, cidx + nrows);
    PetscCall(ISRestoreIndices(icolIS, &cidx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

618: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build/refresh the device copy of the Cholesky/ICC factor of A for use by
// MatSolve_SeqAIJCUSPARSE_Cholesky (which solves Ut D U x = b): the host factor
// is re-packed into a regular CSR matrix U (with an explicit unit diagonal) plus
// a separate diagonal array D, both uploaded to the GPU, after which the cusparse
// SpSV analyses for U and Ut are performed -- or, when cusparseSpSV_updateMatrix
// is available and the analysis was already done, only the values are updated.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; row offsets are Ai unchanged, values are filled below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record the host staging arrays for reuse on later numeric refactorizations
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (done on every call, not just the first setup)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // note the sign flip of the off-diagonal entries
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      // The sparsity pattern is unchanged; just push the new values into the SpSV descriptors
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b, with the optional row/column permutations cached in
// fs->rpermIndices / fs->cpermIndices applied to b and x respectively. D is held
// already inverted (see MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky), so the
// D step is an element-wise multiply rather than a division.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: let the dense-vector descriptor alias b's array directly
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  #if CCCL_VERSION >= 3001000
  // newer CCCL spells the multiplies functor under cuda::std
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
  #else
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
  #endif

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  // two triangular solves (~2*nz - n flops each) plus the n-element diagonal multiply
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
773: #else
// Pre-CUDA-11.4 path: build the csrsv2-based triangular factors of the host ICC
// factor on the GPU. The upper factor U is stored with each row's diagonal entry
// first (its reciprocal), and the "lower" factor reuses the same sparsity as U
// but with rescaled values and is applied via CUSPARSE_OPERATION_TRANSPOSE.
// On repeat calls (structures already built) only the numerical values are refreshed.
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* view the factored matrix through its SBAIJ layout */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz]; /* v[nz] holds the row's diagonal entry */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];          /* negate the off-diagonal entries */
              AALo[j] = AAUp[j] / v[nz];   /* lower factor is U's row scaled by 1/diag */
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* stored as upper triangular; the transpose solveOp below makes it act as the lower factor */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structures already exist: only recompute and re-upload the values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970: #endif

// Push the host ICC/Cholesky factor of A to the GPU (building the device-side
// data structures on first use) and cache the ordering permutation and its
// inverse on the device unless the ordering is the identity.
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            permIS  = aij->row;
  PetscBool                     isIdentity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif
  /* both triangular factors share the off-diagonal entries; the diagonal is counted once */
  factors->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the permutation and its inverse on the device, but only when non-identity */
  PetscCall(ISIdentity(permIS, &isIdentity));
  if (!isIdentity) {
    IS              invIS;
    const PetscInt *inv, *perm;

    PetscCall(ISInvertPermutation(permIS, PETSC_DECIDE, &invIS));
    PetscCall(ISGetIndices(invIS, &inv));
    PetscCall(ISGetIndices(permIS, &perm));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(perm, perm + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv, inv + nrows);
    PetscCall(ISRestoreIndices(invIS, &inv));
    PetscCall(ISDestroy(&invIS));
    PetscCall(ISRestoreIndices(permIS, &perm));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Numeric Cholesky factorization for SeqAIJCUSPARSE: the factorization itself
// runs on the host (after syncing A's values from the GPU), then the solve
// callbacks are installed and the resulting factors are copied to the device.
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* host factorization needs A's latest values on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* pick the MatSolve variant based on whether the ordering is the identity */
  Mat_SeqAIJ *bfact  = (Mat_SeqAIJ *)B->data;
  IS          permIS = bfact->row;
  PetscBool   natural;

  PetscCall(ISIdentity(permIS, &natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/refresh the device copy of the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1047: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049: {
1050:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055:   cusparseIndexBase_t                indexBase;
1056:   cusparseMatrixType_t               matrixType;
1057:   cusparseFillMode_t                 fillMode;
1058:   cusparseDiagType_t                 diagType;

1060:   PetscFunctionBegin;
1061:   /* allocate space for the transpose of the lower triangular factor */
1062:   PetscCall(PetscNew(&loTriFactorT));
1063:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1065:   /* set the matrix descriptors of the lower triangular factor */
1066:   matrixType = cusparseGetMatType(loTriFactor->descr);
1067:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1068:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

1071:   /* Create the matrix description */
1072:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1073:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1074:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1075:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1076:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

1078:   /* set the operation */
1079:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1081:   /* allocate GPU space for the CSC of the lower triangular factor*/
1082:   loTriFactorT->csrMat                 = new CsrMatrix;
1083:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1090:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1092:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1093:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1094:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1095:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096:   #endif

1098:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1099:   {
1100:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1101:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1102:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1104:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105:   #else
1106:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107:   #endif
1108:     PetscCallCUSPARSE(stat);
1109:   }

1111:   PetscCallCUDA(WaitForCUDA());
1112:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1114:   /* Create the solve analysis information */
1115:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1117:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1118:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1119:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1120:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121:   #endif

1123:   /* perform the solve analysis */
1124:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1125:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1127:   PetscCallCUDA(WaitForCUDA());
1128:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1130:   /* assign the pointer */
1131:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1133:   /*********************************************/
1134:   /* Now the Transpose of the Upper Tri Factor */
1135:   /*********************************************/

1137:   /* allocate space for the transpose of the upper triangular factor */
1138:   PetscCall(PetscNew(&upTriFactorT));
1139:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1141:   /* set the matrix descriptors of the upper triangular factor */
1142:   matrixType = cusparseGetMatType(upTriFactor->descr);
1143:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1144:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

1147:   /* Create the matrix description */
1148:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1149:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1150:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1151:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1152:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

1154:   /* set the operation */
1155:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1157:   /* allocate GPU space for the CSC of the upper triangular factor*/
1158:   upTriFactorT->csrMat                 = new CsrMatrix;
1159:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1166:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1168:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1169:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1171:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172:   #endif

1174:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1175:   {
1176:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1177:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1178:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1180:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181:   #else
1182:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183:   #endif
1184:     PetscCallCUSPARSE(stat);
1185:   }

1187:   PetscCallCUDA(WaitForCUDA());
1188:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1190:   /* Create the solve analysis information */
1191:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1193:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1194:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1195:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1196:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197:   #endif

1199:   /* perform the solve analysis */
  /* TODO: refactor this duplicated lower/upper transpose setup into a shared helper function */
1201:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1202:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1204:   PetscCallCUDA(WaitForCUDA());
1205:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1207:   /* assign the pointer */
1208:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1209:   PetscFunctionReturn(PETSC_SUCCESS);
1210: }
1211: #endif

/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used on the device to recover integer permutation indices that were
   temporarily stored as scalar values (see MatSeqAIJCUSPARSEFormExplicitTranspose). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};

/*
  Builds or refreshes the explicit transpose of A on the GPU, stored in
  cusparsestruct->matTranspose.

  For the CSR format the transpose structure (row offsets / column indices) is
  built once with csr2csc; at the same time a permutation map csr2csc_i is
  recorded so that later calls can refresh only the transposed values with a
  single thrust gather instead of re-running csr2csc. For ELL/HYB (CUDA < 11)
  the transpose is rebuilt via hyb2csr -> csr2csc -> csr2hyb.

  Input Parameter:
. A - the MATSEQAIJCUSPARSE matrix

  Notes:
  No-op when A->transupdated is already true. Collective GPU work is logged
  under MAT_CUSPARSEGenerateTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the latest host values are on the device before transposing */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do if the transpose is already up to date */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR path below can update an existing transpose in place; other formats rebuild from scratch */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps row/column dimensions but keeps the same number of nonzeros */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* cache the (uncompressed) row offsets of A on the device; csr2csc below needs them */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* first time: run csr2csc on a 0..nnz-1 sequence (stored as scalars) so the output values
         record, for each transposed entry, the index of the source entry in A; the result is
         converted to integers and cached in csr2csc_i for fast value-only refreshes later */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the (possibly updated) values of A into the transpose using the cached csr2csc_i map */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

1410: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Solve A x = b with the cusparse SpSV LU factors (CUDA >= 11.4 path).

  When the factorization carries a row permutation (fs->rpermIndices), b is first
  gathered into the work buffer fs->X; when it carries a column permutation
  (fs->cpermIndices), the backward solve writes into fs->X and the result is
  scattered into x afterwards. Otherwise the solves read/write the vector arrays
  directly. Note that cusparseSpSV_solve() reuses the external buffer that was
  given to cusparseSpSV_analysis().
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ             *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     trans   = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  /* Gather b through the row permutation into fs->X if needed, and point dnVecDescr_X at the data to solve from */
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)rhs));
  }

  /* Forward solve L Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  /* cusparseSpSV_solve() secretly uses the external buffer passed to cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_L));

  /* Backward solve U X = Y; target the work buffer when a column permutation is still pending */
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_U));

  /* Scatter the work buffer back to x through the column permutation if needed */
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Solve A^T x = b with the cusparse SpSV LU factors (CUDA >= 11.4 path).

  The transpose solves reuse the L and U matrix descriptors with
  CUSPARSE_OPERATION_TRANSPOSE; dedicated SpSV descriptors (spsvDescr_Lt/Ut) and
  their buffers are created and analyzed lazily on the first call. Row/column
  permutations are applied as in MatSolve_SeqAIJCUSPARSE_LU, via the work
  buffer fs->X.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                   *aij     = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t     trans   = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t       spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                nrows   = A->rmap->n;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  /* Lazily create the transpose-solve descriptors and buffers on first use */
  if (!fs->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the analysis phase when the factors have changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  /* Gather b through the row permutation into fs->X if needed, and point dnVecDescr_X at the data to solve from */
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(rhsGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)rhs));
  }

  /* Solve U^T Y = X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, spsvAlg, fs->spsvDescr_Ut));

  /* Solve L^T X = Y; target the work buffer when a column permutation is still pending */
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, trans, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, spsvAlg, fs->spsvDescr_Lt));

  /* Scatter the work buffer back to x through the column permutation if needed */
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + nrows), fs->cpermIndices->end()), solGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533: #else
1534: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solve A^T x = b with the legacy csrsv triangular factors (CUDA < 11.4 path),
  for factorizations that carry row/column permutations.

  The transposed (CSC) copies of the factors are generated lazily on first use.
  b is gathered through the row permutation into x, then U^T and L^T solves run
  via the pre-analyzed transposed factors, and finally the column permutation is
  applied through the work vector (the scatter cannot be done in place).

  Fix: the three thrust::copy calls are now wrapped in PetscCallThrust, matching
  every other thrust call in this file, so thrust launch failures are converted
  into PETSc errors instead of being ignored.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(
    thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Solve A^T x = b on the GPU for a natural-ordering LU factorization (no row/column
  permutations needed). Since (LU)^T = U^T L^T, the transposed system is solved by a
  U^T sweep followed by an L^T sweep, with the intermediate result held in the
  factors' work vector.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT, *upT;
  THRUSTARRAY                       *work;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;

  PetscFunctionBegin;
  loT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  upT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  work = (THRUSTARRAY *)factors->workVector;

  /* Build the transposed triangular factors lazily, on the first transpose solve */
  if (!loT && !upT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve U^T y = b, y goes into the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upT->solveOp, upT->csrMat->num_rows, upT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upT->descr, upT->csrMat->values->data().get(), upT->csrMat->row_offsets->data().get(),
                                         upT->csrMat->column_indices->data().get(), upT->solveInfo, b_d, work->data().get(), upT->solvePolicy, upT->solveBuffer));

  /* Solve L^T x = y */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, loT->solveOp, loT->csrMat->num_rows, loT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loT->descr, loT->csrMat->values->data().get(), loT->csrMat->row_offsets->data().get(),
                                         loT->csrMat->column_indices->data().get(), loT->solveInfo, work->data().get(), x_d, loT->solvePolicy, loT->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Solve A x = b on the GPU using previously computed LU triangular factors, where the
  factorization was done with a (row, column) reordering. The row permutation is applied
  to b before the triangular solves and the column permutation to the result afterwards.
  Uses the factors' work vector as scratch; solves run on PetscDefaultCudaStream.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU[k] = b[rperm[k]].
     The copy length is the length of rpermIndices (presumably n; confirm against setup code). */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L y = P_r b; the result y lands in xarray (reused as scratch here) */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U z = y; z lands in tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x[k] = z[cperm[k]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* ~2 flops per factored nonzero, minus n for the unit/implicit diagonal handling */
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Solve A x = b on the GPU for a natural-ordering LU factorization: no permutations are
  required, so this is just a forward solve with L followed by a backward solve with U,
  using the factors' work vector for the intermediate result.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;

  PetscFunctionBegin;
  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx, &x_d));
  PetscCall(VecCUDAGetArrayRead(bb, &b_d));

  PetscCall(PetscLogGpuTimeBegin());
  /* Forward solve: L y = b, y goes into the work vector */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b_d, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Backward solve: U x = y */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x_d, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x_d));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1694: #endif

1696: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Numeric ILU(0) factorization of A into fact, performed in place on the GPU with
  cusparseXcsrilu02(). Assumes the symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0)
  already allocated fact's CSR arrays, descriptors, and work buffers and ran the analysis.
  On success, installs the SpSV-based solve/solvetranspose callbacks on fact.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() synchronizes; only pay that cost in debug builds */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* On CUDA >= 12.1.1, after the first full analysis we only need to push the new values
     into the existing SpSV descriptors instead of redoing the (expensive) analysis */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Symbolic ILU(0) factorization setup for a MATSEQAIJCUSPARSE matrix.

  ILU(0) keeps A's sparsity pattern, so this routine copies A's CSR structure (i, j) to
  fact on the device, creates the cuSPARSE descriptors for M (the in-place factor), L,
  and U, sizes and allocates the csrilu02/SpSV buffers, runs the structural analysis,
  and precomputes a FLOP estimate for the numeric phase. The reordering IS arguments are
  ignored (natural ordering only). Installs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0.

  Fix vs. previous revision: the FLOP estimate used nzLeft = (nzRow - 1) / 2 — the
  symmetric-half heuristic from the ICC path — overwriting the exact count of nonzeros
  left of the diagonal computed just above it. The exact count is now used.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) has no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L from ILU has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt       *Ai_h, nzRow, nzLeft; /* Ai_h: host row pointers; avoids shadowing the device Ai above */
    const PetscInt *adiag;
    PetscLogDouble  flops = 0.0;

    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
    Ai_h = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai_h[i] < adiag[i] && adiag[i] < Ai_h[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai_h[i + 1] - Ai_h[i];
        nzLeft = adiag[i] - Ai_h[i]; /* exact count of nonzeros strictly left of the diagonal */
        /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the
           eliminated entry and every entry to its right, incurring a multiplication and an addition each.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Triangular solves for an ICC(0) factorization A = L L^T: forward solve L y = b,
  then backward solve L^T x = y, both via cusparseSpSV_solve() with the single
  lower-triangular descriptor (the transpose solve uses spsvDescr_Lt).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *x_d;
  const PetscScalar            *b_d;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b: bind b to the X descriptor, the scratch vector fs->Y to Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve L^T x = y: rebind X to the output array and solve with the transpose op */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Numeric ICC(0) (incomplete Cholesky, zero fill) factorization of A into fact, done
  in place on the GPU with cusparseXcsric02(). Assumes the symbolic phase set up fact's
  CSR arrays, descriptors, and buffers. Installs MatSolve_SeqAIJCUSPARSE_ICC0 for both
  solve and solvetranspose (the factorization is symmetric).
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() synchronizes; only pay that cost in debug builds */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* On CUDA >= 12.1.1, after the first full analysis we only need to push the new values
     into the existing SpSV descriptors instead of redoing the (expensive) analysis */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric and needs valid values, so it must follow cusparseXcsric02() */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* A = L L^T is symmetric, so transpose solve is the same */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

2009: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2010: {
2011:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2012:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2013:   PetscInt                      m, nz;

2015:   PetscFunctionBegin;
2016:   if (PetscDefined(USE_DEBUG)) {
2017:     PetscBool flg, diagDense;

2019:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2020:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2021:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2022:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2023:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2024:   }

2026:   /* Free the old stale stuff */
2027:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

2029:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2030:      but they will not be used. Allocate them just for easy debugging.
2031:    */
2032:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

2034:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2035:   fact->factortype             = MAT_FACTOR_ICC;
2036:   fact->info.factor_mallocs    = 0;
2037:   fact->info.fill_ratio_given  = info->fill;
2038:   fact->info.fill_ratio_needed = 1.0;

2040:   aij->row = NULL;
2041:   aij->col = NULL;

2043:   /* ====================================================================== */
2044:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2045:   /* We'll do in-place factorization on fact                                */
2046:   /* ====================================================================== */
2047:   const int *Ai, *Aj;

2049:   m  = fact->rmap->n;
2050:   nz = aij->nz;

2052:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2053:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2054:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2055:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2056:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2057:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

2059:   /* ====================================================================== */
2060:   /* Create mat descriptors for M, L                                        */
2061:   /* ====================================================================== */
2062:   cusparseFillMode_t fillMode;
2063:   cusparseDiagType_t diagType;

2065:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2066:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2067:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

2069:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2070:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2071:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2072:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2073:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2074:   */
2075:   fillMode = CUSPARSE_FILL_MODE_LOWER;
2076:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2077:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2078:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2079:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

2081:   /* ========================================================================= */
2082:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2083:   /* ========================================================================= */
2084:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2085:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

2087:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2088:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

2090:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2091:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

2093:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2094:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

2096:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2097:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

2099:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2100:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2101:    */
2102:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2103:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2104:     fs->spsvBuffer_L = fs->factBuffer_M;
2105:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2106:   } else {
2107:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2108:     fs->spsvBuffer_Lt = fs->factBuffer_M;
2109:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2110:   }

2112:   /* ========================================================================== */
2113:   /* Perform analysis of ic0 on M                                               */
2114:   /* The lower triangular part of M has the same sparsity pattern as L          */
2115:   /* ========================================================================== */
2116:   int              structural_zero;
2117:   cusparseStatus_t status;

2119:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2120:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2121:   if (PetscDefined(USE_DEBUG)) {
2122:     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2123:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2124:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2125:   }

2127:   /* Estimate FLOPs of the numeric factorization */
2128:   {
2129:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2130:     PetscInt      *Ai, nzRow, nzLeft;
2131:     PetscLogDouble flops = 0.0;

2133:     Ai = Aseq->i;
2134:     for (PetscInt i = 0; i < m; i++) {
2135:       nzRow = Ai[i + 1] - Ai[i];
2136:       if (nzRow > 1) {
2137:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2138:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2139:         */
2140:         nzLeft = (nzRow - 1) / 2;
2141:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2142:       }
2143:     }
2144:     fs->numericFactFlops = flops;
2145:   }
2146:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2147:   PetscFunctionReturn(PETSC_SUCCESS);
2148: }
2149: #endif

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  /* use_cpu_solve lives on A's Mat_SeqAIJCUSPARSE; B, being a factored matrix, carries a
     Mat_SeqAIJCUSPARSETriFactors in its spptr instead, so we must query A here. */
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  /* The numeric factorization itself runs on the host: refresh A's host values first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Choose the MatSolve flavor depending on whether both orderings are natural */
    Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
    PetscBool   rid, cid;

    PetscCall(ISIdentity(bseq->row, &rid));
    PetscCall(ISIdentity(bseq->col, &cid));
    if (rid && cid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* For the GPU solve path, push the triangular factors up to the device now */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Discard any triangular factors left over from a previous factorization of B */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  /* The symbolic phase runs on the host; route the numeric phase to the CUSPARSE version */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* ILU(0) with natural row and column orderings can be done entirely on the device */
  PetscBool natural = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscBool rid, cid;

    PetscCall(ISIdentity(isrow, &rid));
    PetscCall(ISIdentity(iscol, &cid));
    natural = (PetscBool)(rid && cid);
  }
  if (natural && !info->levels) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* General case: host symbolic phase, CUSPARSE-backed numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* ICC(0) with the natural ordering can be done entirely on the device */
  PetscBool natural = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &natural));
  if (natural && !info->levels) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* General case: host symbolic phase, CUSPARSE-backed numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop stale factors, run the host symbolic phase, then route the numeric phase to CUSPARSE */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Query callback (composed below as "MatFactorGetSolverType_C") reporting which
   MatSolverType produced this factored matrix. */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

2262: /*MC
2263:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2264:   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2267:   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2268:   algorithms are not recommended. This class does NOT support direct solver operations.

2270:   Level: beginner

2272: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2273:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2274: M*/

/* Create the factor matrix B for A using the MATSOLVERCUSPARSE solver package.
   Supports LU/ILU/ILUDT and Cholesky/ICC factor types. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // setting factortype before MatSetType() makes it allocate spptr as a Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (A->boundtocpu) {
      /* CPU-bound matrices fall back to the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Refresh the host copy of the matrix values from the device when the host copy is stale.
   Only values are transferred; the sparsity pattern (i,j) is not touched. Works both for
   plain matrices (values in Mat_SeqAIJCUSPARSE) and, with CUDA >= 11.4, for factored
   matrices (values in Mat_SeqAIJCUSPARSETriFactors::csrVal). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* valid interpretation of spptr only when A is not factored */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* valid interpretation when A is factored */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* host copy is stale */
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now hold the same values */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Bring the host values up to date before lending out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Host values may have been modified through the borrowed pointer: mark the device copy stale */
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Refresh host values from the device, then lend them out read-only */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = a->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access ended: nothing to invalidate, just drop the borrowed pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host copy is performed here (unlike the Get/GetRead
   variants), so callers are expected to overwrite the values rather than read them. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write access ended: the host values were (re)written, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Return device pointers to the CSR arrays (row offsets i, column indices j, values a)
   of a non-factored MATSEQAIJCUSPARSE matrix, along with the memory type (CUDA).
   Any of i/j/a/mtype may be NULL if not wanted. With 64-bit PetscInt this errors for
   i and j, since the device arrays are 32-bit (THRUSTINTARRAY32). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Make sure the device-side CSR structures exist and are current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* fixed grammar of the error message ("does not supported" -> "does not support") */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy a SeqAIJ matrix from host to device. If the nonzero pattern is unchanged and the
   format is CSR, only the values are refreshed; otherwise all cusparse structures are
   rebuilt from scratch (CSR arrays, descriptors, compressed-row index, work vector). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set FALSE when the host has no values, so offloadmask is left alone */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so a cached explicit transpose is stale (pattern kept: PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern (or storage format) changed: destroy and rebuild device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; ridx maps compressed rows back to global rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload only the pattern and do not claim device values are valid */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, needed because of CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR, convert to HYB, then discard the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Thrust functor: given a tuple (src, dst), accumulate src into dst, i.e. dst += src. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};

/* Thrust functor: given a tuple (src, dst), copy src into dst, i.e. dst = src. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: given a tuple (dst, src), copy the second element into the first,
   i.e. dst = src (assignment direction reversed with respect to VecCUDAEquals). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Context attached to C->product->data for MatMat products computed with cusparse;
   freed by MatProductCtxDestroy_MatMatCusparse(). */
struct MatProductCtx_MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably "C is dense" — confirm against the setup routine that fills it */
  PetscScalar   *Bt;       /* device buffer (cudaFree'd in the destroy callback); presumably an explicit transpose of B — confirm */
  Mat            X;        /* work matrix, used to hold the intermediate result for RARt/PtAP (see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count accumulated for logging */
  CsrMatrix     *Bcsr;     /* CSR representation of B, deleted with the context */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse-matrix descriptor (destroyed with cusparseDestroySpMat) */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense-matrix descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense-matrix descriptor for C */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra device work buffers (CUDA >= 11.4 only); cudaFree'd in the destroy callback */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* main matmat work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* SpGEMM descriptor (destroyed with cusparseSpGEMM_destroyDescr) */
#endif
};

/* Destroy callback for MatProductCtx_MatMatCusparse: releases all device buffers,
   cusparse descriptors, the work matrix X, and finally the context itself.
   (cudaFree and delete are no-ops on NULL, so unallocated members are safe.) */
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
{
  MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptor destroy routines are only called when the descriptor was actually created */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(mmdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}

2646: #include <../src/mat/impls/dense/seq/dense.h>

/* Numeric phase of C = op(A)*op(B) with A of type MATSEQAIJCUSPARSE and B dense.

   Handles AB, AtB and ABt directly via cusparseSpMM (or cusparseXcsrmm pre CUDA-11).
   RARt and PtAP are computed in two steps: the sparse-dense product A*R^T (resp. A*P)
   is written into the intermediate dense matrix mmdata->X allocated by the symbolic
   phase, and the remaining dense-dense product is done at the end of this routine.

   The cuSPARSE dense/sparse descriptors and the SpMM work buffer are cached in the
   product context (mmdata) and are rebuilt only on the first call or when the leading
   dimensions of B or C change; otherwise only the data pointers are refreshed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda; /* result sizes and leading dimensions of B and C (or X) */
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatProductCtx_MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatProductCtx_MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* Pick the sparse operand (A or its explicit transpose), the cuSPARSE op, and the
     dimensions m x n of the sparse-dense result (which is X, not C, for RARt/PtAP) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cuSPARSE transpose on the fly, or use the explicitly formed A^T */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n; /* B is applied transposed (opB below, or explicit B^T pre CUDA-11) */
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* for RARt/PtAP the SpMM output goes into the intermediate matrix X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; /* one cached descriptor per operation */
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    /* query the workspace size for this (opA, opB, descriptors, algorithm) combination */
    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* grow (never shrink) the cached workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* materialize B^T into mmdata->Bt (allocated by the symbolic phase) with a geam transpose */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* finish RARt/PtAP with the dense-dense product: C = R*X (RARt) or C = P^T*X (PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary host->device conversions requested by the caller's types */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of C = op(A)*op(B) with A of type MATSEQAIJCUSPARSE and B dense.

   Determines the sizes and block sizes of C from the product type, switches C to the
   GPU dense type (remembering whether the caller asked for a host dense result so the
   numeric phase can convert back), and allocates the product context. For RARt/PtAP an
   intermediate dense matrix X is created to hold the sparse-dense partial product. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n;
  PetscBool                     cIsHostDense, isAijCusparse;
  MatProductCtx_MatMatCusparse *ctx;
  Mat_SeqAIJCUSPARSE           *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isAijCusparse));
  PetscCheck(isAijCusparse, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result sizes and block sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB: /* rows(A) x cols(B) */
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB: /* cols(A) x cols(B) */
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt: /* rows(A) x rows(B) */
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP: /* square, cols(P) x cols(P) */
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) {
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    }
    break;
  case MATPRODUCT_RARt: /* square, rows(R) x rows(R) */
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) {
      PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
      PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    }
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cIsHostDense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&ctx));
  ctx->cisdense = cIsHostDense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&ctx->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &ctx->X));
    PetscCall(MatSetType(ctx->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      PetscCall(MatSetSizes(ctx->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(ctx->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = ctx;
  C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric phase of the sparse-sparse product (SpGEMM) C = op(A)*op(B) for
   MATSEQAIJCUSPARSE matrices.

   If mmdata->reusesym is set, the values were already computed during the symbolic
   phase (api_user case), so only the assembly bookkeeping at 'finalize' is done.
   AtB/ABt are mapped to AB by using the explicit transposes formed during the
   symbolic phase (cuSPARSE SpGEMM does not support transposed operands). When B is
   stored in compressed row format, the full-row-offset CSR wrapper mmdata->Bcsr and
   its descriptor mmdata->matSpBDescr (built by the symbolic phase) are used instead. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatProductCtx_MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute on the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets us replace AtB/ABt with plain AB; the symbolic phase must agree */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* select plain or explicitly transposed operands formed during symbolic */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was fixed during symbolic; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older SpGEMM API: recompute then copy the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

3025: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3026: {
3027:   Mat_Product                  *product = C->product;
3028:   Mat                           A, B;
3029:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3030:   Mat_SeqAIJ                   *a, *b, *c;
3031:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3032:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3033:   PetscInt                      i, j, m, n, k;
3034:   PetscBool                     flg;
3035:   cusparseStatus_t              stat;
3036:   MatProductType                ptype;
3037:   MatProductCtx_MatMatCusparse *mmdata;
3038:   PetscLogDouble                flops;
3039:   PetscBool                     biscompressed, ciscompressed;
3040: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3041:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3042:   cusparseSpMatDescr_t BmatSpDescr;
3043: #else
3044:   int cnz;
3045: #endif
3046:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

3048:   PetscFunctionBegin;
3049:   MatCheckProduct(C, 1);
3050:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3051:   A = product->A;
3052:   B = product->B;
3053:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3054:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3055:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3056:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3057:   a = (Mat_SeqAIJ *)A->data;
3058:   b = (Mat_SeqAIJ *)B->data;
3059:   /* product data */
3060:   PetscCall(PetscNew(&mmdata));
3061:   C->product->data    = mmdata;
3062:   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

3064:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3065:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3066:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3067:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3068:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3069:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

3071:   ptype = product->type;
3072:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3073:     ptype                                          = MATPRODUCT_AB;
3074:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3075:   }
3076:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3077:     ptype                                          = MATPRODUCT_AB;
3078:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3079:   }
3080:   biscompressed = PETSC_FALSE;
3081:   ciscompressed = PETSC_FALSE;
3082:   switch (ptype) {
3083:   case MATPRODUCT_AB:
3084:     m    = A->rmap->n;
3085:     n    = B->cmap->n;
3086:     k    = A->cmap->n;
3087:     Amat = Acusp->mat;
3088:     Bmat = Bcusp->mat;
3089:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3090:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3091:     break;
3092:   case MATPRODUCT_AtB:
3093:     m = A->cmap->n;
3094:     n = B->cmap->n;
3095:     k = A->rmap->n;
3096:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3097:     Amat = Acusp->matTranspose;
3098:     Bmat = Bcusp->mat;
3099:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3100:     break;
3101:   case MATPRODUCT_ABt:
3102:     m = A->rmap->n;
3103:     n = B->rmap->n;
3104:     k = A->cmap->n;
3105:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3106:     Amat = Acusp->mat;
3107:     Bmat = Bcusp->matTranspose;
3108:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3109:     break;
3110:   default:
3111:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3112:   }

3114:   /* create cusparse matrix */
3115:   PetscCall(MatSetSizes(C, m, n, m, n));
3116:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3117:   c     = (Mat_SeqAIJ *)C->data;
3118:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3119:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3120:   Ccsr  = new CsrMatrix;

3122:   c->compressedrow.use = ciscompressed;
3123:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3124:     c->compressedrow.nrows = a->compressedrow.nrows;
3125:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3126:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3127:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3128:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3129:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3130:   } else {
3131:     c->compressedrow.nrows  = 0;
3132:     c->compressedrow.i      = NULL;
3133:     c->compressedrow.rindex = NULL;
3134:     Ccusp->workVector       = NULL;
3135:     Cmat->cprowIndices      = NULL;
3136:   }
3137:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3138:   Ccusp->mat        = Cmat;
3139:   Ccusp->mat->mat   = Ccsr;
3140:   Ccsr->num_rows    = Ccusp->nrows;
3141:   Ccsr->num_cols    = n;
3142:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3143:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3144:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3145:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3146:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3147:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3148:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3149:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3150:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3152:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3153:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3154:     c->nz                = 0;
3155:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3156:     Ccsr->values         = new THRUSTARRAY(c->nz);
3157:     goto finalizesym;
3158:   }

3160:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3161:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3162:   Acsr = (CsrMatrix *)Amat->mat;
3163:   if (!biscompressed) {
3164:     Bcsr = (CsrMatrix *)Bmat->mat;
3165: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3166:     BmatSpDescr = Bmat->matDescr;
3167: #endif
3168:   } else { /* we need to use row offsets for the full matrix */
3169:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3170:     Bcsr                 = new CsrMatrix;
3171:     Bcsr->num_rows       = B->rmap->n;
3172:     Bcsr->num_cols       = cBcsr->num_cols;
3173:     Bcsr->num_entries    = cBcsr->num_entries;
3174:     Bcsr->column_indices = cBcsr->column_indices;
3175:     Bcsr->values         = cBcsr->values;
3176:     if (!Bcusp->rowoffsets_gpu) {
3177:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3178:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3179:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3180:     }
3181:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3182:     mmdata->Bcsr      = Bcsr;
3183: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3184:     if (Bcsr->num_rows && Bcsr->num_cols) {
3185:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3186:       PetscCallCUSPARSE(stat);
3187:     }
3188:     BmatSpDescr = mmdata->matSpBDescr;
3189: #endif
3190:   }
3191:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3192:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3193:   /* precompute flops count */
3194:   if (ptype == MATPRODUCT_AB) {
3195:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3196:       const PetscInt st = a->i[i];
3197:       const PetscInt en = a->i[i + 1];
3198:       for (j = st; j < en; j++) {
3199:         const PetscInt brow = a->j[j];
3200:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3201:       }
3202:     }
3203:   } else if (ptype == MATPRODUCT_AtB) {
3204:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3205:       const PetscInt anzi = a->i[i + 1] - a->i[i];
3206:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3207:       flops += (2. * anzi) * bnzi;
3208:     }
3209:   } else { /* TODO */
3210:     flops = 0.;
3211:   }

3213:   mmdata->flops = flops;
3214:   PetscCall(PetscLogGpuTimeBegin());

3216: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3217:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3218:   // cuda-12.2 requires non-null csrRowOffsets
3219:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3220:   PetscCallCUSPARSE(stat);
3221:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3222:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3223:   {
3224:     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3225:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3226:   */
3227:     void *dBuffer1 = NULL;
3228:     void *dBuffer2 = NULL;
3229:     void *dBuffer3 = NULL;
3230:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3231:     size_t bufferSize1 = 0;
3232:     size_t bufferSize2 = 0;
3233:     size_t bufferSize3 = 0;
3234:     size_t bufferSize4 = 0;
3235:     size_t bufferSize5 = 0;

3237:     /* ask bufferSize1 bytes for external memory */
3238:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3239:     PetscCallCUSPARSE(stat);
3240:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3241:     /* inspect the matrices A and B to understand the memory requirement for the next step */
3242:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3243:     PetscCallCUSPARSE(stat);

3245:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3246:     PetscCallCUSPARSE(stat);
3247:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3248:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3249:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3250:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3251:     PetscCallCUSPARSE(stat);
3252:     PetscCallCUDA(cudaFree(dBuffer1));
3253:     PetscCallCUDA(cudaFree(dBuffer2));

3255:     /* get matrix C non-zero entries C_nnz1 */
3256:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3257:     c->nz = (PetscInt)C_nnz1;
3258:     /* allocate matrix C */
3259:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3260:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3261:     Ccsr->values = new THRUSTARRAY(c->nz);
3262:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3263:     /* update matC with the new pointers */
3264:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3265:     PetscCallCUSPARSE(stat);

3267:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3268:     PetscCallCUSPARSE(stat);
3269:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3270:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3271:     PetscCallCUSPARSE(stat);
3272:     PetscCallCUDA(cudaFree(dBuffer3));
3273:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3274:     PetscCallCUSPARSE(stat);
3275:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3276:   }
3277:   #else
3278:   size_t bufSize2;
3279:   /* ask bufferSize bytes for external memory */
3280:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3281:   PetscCallCUSPARSE(stat);
3282:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3283:   /* inspect the matrices A and B to understand the memory requirement for the next step */
3284:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3285:   PetscCallCUSPARSE(stat);
3286:   /* ask bufferSize again bytes for external memory */
3287:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3288:   PetscCallCUSPARSE(stat);
3289:   /* The CUSPARSE documentation is not clear, nor the API
3290:      We need both buffers to perform the operations properly!
3291:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3292:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3293:      is stored in the descriptor! What a messy API... */
3294:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3295:   /* compute the intermediate product of A * B */
3296:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3297:   PetscCallCUSPARSE(stat);
3298:   /* get matrix C non-zero entries C_nnz1 */
3299:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3300:   c->nz = (PetscInt)C_nnz1;
3301:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3302:                       mmdata->mmBufferSize / 1024));
3303:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3304:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3305:   Ccsr->values = new THRUSTARRAY(c->nz);
3306:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3307:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3308:   PetscCallCUSPARSE(stat);
3309:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3310:   PetscCallCUSPARSE(stat);
3311:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3312: #else
3313:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3314:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3315:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3316:   PetscCallCUSPARSE(stat);
3317:   c->nz                = cnz;
3318:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3319:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3320:   Ccsr->values = new THRUSTARRAY(c->nz);
3321:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

3323:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3324:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3325:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3326:      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3327:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3328:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3329:   PetscCallCUSPARSE(stat);
3330: #endif
3331:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3332:   PetscCall(PetscLogGpuTimeEnd());
3333: finalizesym:
3334:   c->free_a = PETSC_TRUE;
3335:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3336:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3337:   c->free_ij = PETSC_TRUE;
3338:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3339:     PetscInt      *d_i = c->i;
3340:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3341:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3342:     ii = *Ccsr->row_offsets;
3343:     jj = *Ccsr->column_indices;
3344:     if (ciscompressed) d_i = c->compressedrow.i;
3345:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3347:   } else {
3348:     PetscInt *d_i = c->i;
3349:     if (ciscompressed) d_i = c->compressedrow.i;
3350:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3352:   }
3353:   if (ciscompressed) { /* need to expand host row offsets */
3354:     PetscInt r = 0;
3355:     c->i[0]    = 0;
3356:     for (k = 0; k < c->compressedrow.nrows; k++) {
3357:       const PetscInt next = c->compressedrow.rindex[k];
3358:       const PetscInt old  = c->compressedrow.i[k];
3359:       for (; r < next; r++) c->i[r + 1] = old;
3360:     }
3361:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3362:   }
3363:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3364:   PetscCall(PetscMalloc1(m, &c->ilen));
3365:   PetscCall(PetscMalloc1(m, &c->imax));
3366:   c->maxnz         = c->nz;
3367:   c->nonzerorowcnt = 0;
3368:   c->rmax          = 0;
3369:   for (k = 0; k < m; k++) {
3370:     const PetscInt nn = c->i[k + 1] - c->i[k];
3371:     c->ilen[k] = c->imax[k] = nn;
3372:     c->nonzerorowcnt += (PetscInt)!!nn;
3373:     c->rmax = PetscMax(c->rmax, nn);
3374:   }
3375:   PetscCall(PetscMalloc1(c->nz, &c->a));
3376:   Ccsr->num_entries = c->nz;

3378:   C->nonzerostate++;
3379:   PetscCall(PetscLayoutSetUp(C->rmap));
3380:   PetscCall(PetscLayoutSetUp(C->cmap));
3381:   Ccusp->nonzerostate = C->nonzerostate;
3382:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3383:   C->preallocated     = PETSC_TRUE;
3384:   C->assembled        = PETSC_FALSE;
3385:   C->was_assembled    = PETSC_FALSE;
3386:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3387:     mmdata->reusesym = PETSC_TRUE;
3388:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3389:   }
3390:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3391:   PetscFunctionReturn(PETSC_SUCCESS);
3392: }

3394: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3396: /* handles sparse or dense B */
/* Choose the productsymbolic implementation for a matrix product whose result is MATSEQAIJCUSPARSE.
   B may be sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE); depending on the product type,
   CPU-binding of the operands, and command line options, this selects the GPU sparse-sparse path,
   the sparse-dense CUDA path, or falls back to the plain SeqAIJ (CPU) implementation. */
3397: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3398: {
3399:   Mat_Product *product = mat->product;
3400:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3402:   PetscFunctionBegin;
3403:   MatCheckProduct(mat, 1);
3404:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when neither operand has been bound to the CPU */
3405:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3406:   if (product->type == MATPRODUCT_ABC) {
3407:     Ciscusp = PETSC_FALSE;
3408:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3409:   }
  /* per-product-type options let the user force the CPU backend; the option name depends on
     whether the old API (MatMatMult etc., api_user) or the MatProduct API was used */
3410:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3411:     PetscBool usecpu = PETSC_FALSE;
3412:     switch (product->type) {
3413:     case MATPRODUCT_AB:
3414:       if (product->api_user) {
3415:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3416:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3417:         PetscOptionsEnd();
3418:       } else {
3419:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3420:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3421:         PetscOptionsEnd();
3422:       }
3423:       break;
3424:     case MATPRODUCT_AtB:
3425:       if (product->api_user) {
3426:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3427:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3428:         PetscOptionsEnd();
3429:       } else {
3430:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3431:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3432:         PetscOptionsEnd();
3433:       }
3434:       break;
3435:     case MATPRODUCT_PtAP:
3436:       if (product->api_user) {
3437:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3438:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3439:         PetscOptionsEnd();
3440:       } else {
3441:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3442:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3443:         PetscOptionsEnd();
3444:       }
3445:       break;
3446:     case MATPRODUCT_RARt:
3447:       if (product->api_user) {
3448:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3449:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3450:         PetscOptionsEnd();
3451:       } else {
3452:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3453:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3454:         PetscOptionsEnd();
3455:       }
3456:       break;
3457:     case MATPRODUCT_ABC:
3458:       if (product->api_user) {
3459:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3460:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3461:         PetscOptionsEnd();
3462:       } else {
3463:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3464:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3465:         PetscOptionsEnd();
3466:       }
3467:       break;
3468:     default:
3469:       break;
3470:     }
3471:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3472:   }
3473:   /* dispatch */
3474:   if (isdense) {
3475:     switch (product->type) {
3476:     case MATPRODUCT_AB:
3477:     case MATPRODUCT_AtB:
3478:     case MATPRODUCT_ABt:
3479:     case MATPRODUCT_PtAP:
3480:     case MATPRODUCT_RARt:
  /* sparse * dense: use the CUDA dense path unless A lives on the CPU */
3481:       if (product->A->boundtocpu) {
3482:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3483:       } else {
3484:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3485:       }
3486:       break;
3487:     case MATPRODUCT_ABC:
3488:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3489:       break;
3490:     default:
3491:       break;
3492:     }
3493:   } else if (Biscusp && Ciscusp) {
3494:     switch (product->type) {
3495:     case MATPRODUCT_AB:
3496:     case MATPRODUCT_AtB:
3497:     case MATPRODUCT_ABt:
3498:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3499:       break;
  /* composed products are realized as sequences of two-matrix products */
3500:     case MATPRODUCT_PtAP:
3501:     case MATPRODUCT_RARt:
3502:     case MATPRODUCT_ABC:
3503:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3504:       break;
3505:     default:
3506:       break;
3507:     }
3508:   } else { /* fallback for AIJ */
3509:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3510:   }
3511:   PetscFunctionReturn(PETSC_SUCCESS);
3512: }

/* yy = A xx; thin wrapper over the common SpMV kernel (no add, no transpose, no Hermitian) */
3514: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3515: {
3516:   PetscFunctionBegin;
3517:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3518:   PetscFunctionReturn(PETSC_SUCCESS);
3519: }

/* zz = A xx + yy; thin wrapper over the common SpMV kernel (add, no transpose, no Hermitian) */
3521: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3522: {
3523:   PetscFunctionBegin;
3524:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3525:   PetscFunctionReturn(PETSC_SUCCESS);
3526: }

/* yy = A^H xx; thin wrapper over the common SpMV kernel (transpose and Hermitian flags set) */
3528: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3529: {
3530:   PetscFunctionBegin;
3531:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3532:   PetscFunctionReturn(PETSC_SUCCESS);
3533: }

/* zz = A^H xx + yy; thin wrapper over the common SpMV kernel (add, transpose, Hermitian) */
3535: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3536: {
3537:   PetscFunctionBegin;
3538:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3539:   PetscFunctionReturn(PETSC_SUCCESS);
3540: }

/* yy = A^T xx; thin wrapper over the common SpMV kernel (transpose, no Hermitian) */
3542: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3543: {
3544:   PetscFunctionBegin;
3545:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3546:   PetscFunctionReturn(PETSC_SUCCESS);
3547: }

/* y[idx[i]] += x[i] for i in [0,n): scatter-add a short work vector back into the full-length
   output. Caller passes distinct indices (the compressed-row index list), so no atomics are
   needed -- TODO(review) confirm idx never repeats for other call sites. Launched with a 1-D
   grid of 1-D blocks, one thread per entry. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* compute the global index in PetscInt: the product blockIdx.x * blockDim.x is evaluated
     in (signed) int otherwise and can overflow for very large n (signed overflow is UB) */
  PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

3555: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3556: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3557: {
3558:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3559:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3560:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3561:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3562:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3563:   PetscBool                     compressed;
3564: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3565:   PetscInt nx, ny;
3566: #endif

3568:   PetscFunctionBegin;
3569:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* matrix has no stored nonzeros: the result reduces to y (or zero when no y is given) */
3570:   if (!a->nz) {
3571:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3572:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
3573:     PetscFunctionReturn(PETSC_SUCCESS);
3574:   }
3575:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3576:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3577:   if (!trans) {
3578:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3579:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3580:   } else {
  /* transpose product: either let cusparse transpose A on the fly (opA = ^T/^H), or use an
     explicitly stored transpose (built lazily) when the matrix is configured to keep one */
3581:     if (herm || !A->form_explicit_transpose) {
3582:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3583:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3584:     } else {
3585:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3586:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3587:     }
3588:   }
3589:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3590:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  /* NOTE: only exceptions thrown as char* are converted to PETSc errors by the catch below */
3592:   try {
3593:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3594:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3595:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3597:     PetscCall(PetscLogGpuTimeBegin());
3598:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3599:       /* z = A x + beta y.
3600:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3601:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3602:       */
3603:       xptr = xarray;
3604:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3605:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3606: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3607:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3608:           allocated to accommodate different uses. So we get the length info directly from mat.
3609:        */
3610:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3611:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3612:         nx             = mat->num_cols; // since y = Ax
3613:         ny             = mat->num_rows;
3614:       }
3615: #endif
3616:     } else {
3617:       /* z = A^T x + beta y
3618:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3619:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3620:        */
3621:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3622:       dptr = zarray;
3623:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3624:       if (compressed) { /* Scatter x to work vector */
3625:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

  /* gather x at the compressed-row positions into the work vector (workVector[i] = x[cprowIndices[i]]) */
3627:         thrust::for_each(
3628: #if PetscDefined(HAVE_THRUST_ASYNC)
3629:           thrust::cuda::par.on(PetscDefaultCudaStream),
3630: #endif
3631:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3632:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3633:       }
3634: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3635:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3636:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3637:         nx             = mat->num_rows; // since y = A^T x
3638:         ny             = mat->num_cols;
3639:       }
3640: #endif
3641:     }

3643:     /* csr_spmv does y = alpha op(A) x + beta y */
3644:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3645: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3646:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3647:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3648:   #else
3649:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3650:   #endif

3652:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3653:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  /* lazily create the per-opA matrix descriptor from the CSR arrays */
3654:       if (!matDescr) {
3655:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3656:         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3657:       }
3658:   #endif

3660:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3661:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3662:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3663:         PetscCallCUSPARSE(
3664:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3665:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3666:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3667:         PetscCallCUSPARSE(
3668:           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3669:   #endif
3670:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3671:       } else {
3672:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3673:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3674:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3675:       }

3677:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3678: #else
3679:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3680:       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3681: #endif
3682:     } else {
3683:       if (cusparsestruct->nrows) {
3684: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3685:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3686: #else
3687:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3688:         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3689: #endif
3690:       }
3691:     }
3692:     PetscCall(PetscLogGpuTimeEnd());

3694:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3695:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3696:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3697:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3698:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3699:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3700:         }
3701:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3702:         PetscCall(VecSeq_CUDA::Set(zz, 0));
3703:       }

3705:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3706:       if (compressed) {
3707:         PetscCall(PetscLogGpuTimeBegin());
3708:         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3709:         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3710:         PetscCall(PetscLogGpuTimeEnd());
3711:       }
3712:     } else {
3713:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3714:     }
3715:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3716:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3717:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3718:   } catch (char *ex) {
3719:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3720:   }
  /* 2 flops (mul+add) per stored nonzero; without a y the initial add per nonzero row is saved */
3721:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3722:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3723:   PetscFunctionReturn(PETSC_SUCCESS);
3724: }

/* zz = A^T xx + yy; thin wrapper over the common SpMV kernel (add, transpose, no Hermitian) */
3726: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3727: {
3728:   PetscFunctionBegin;
3729:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3730:   PetscFunctionReturn(PETSC_SUCCESS);
3731: }

3733: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);

/* One thread per row: scan row r of the CSR structure for the stored entry whose column index
   equals r and write it to diag[r]; a row with no stored diagonal entry yields 0. Launched with
   a 1-D grid of 1-D blocks covering len rows. */
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r < len) {
    const PetscInt rstart = row[r];
    const PetscInt rend   = row[r + 1];
    PetscScalar    dval   = 0.0;

    for (PetscInt j = rstart; j < rend; j++) {
      if (col[j] == r) {
        dval = val[j];
        break;
      }
    }
    diag[r] = dval;
  }
}

/* Extract the diagonal of A into diag. Runs the CSR kernel on the GPU when the up-to-date data
   lives there; otherwise falls back to the host SeqAIJ implementation. Only the CSR storage
   format is supported on the GPU path. */
3753: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
3754: {
3755:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3756:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3757:   PetscScalar                  *darray;

3759:   PetscFunctionBegin;
  /* use the device data only when it is current (on GPU or synchronized on both) */
3760:   if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
3761:     PetscInt   n   = A->rmap->n;
3762:     CsrMatrix *mat = (CsrMatrix *)matstruct->mat;

3764:     PetscCheck(cusparsestruct->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
3765:     if (n > 0) {
3766:       PetscCall(VecCUDAGetArrayWrite(diag, &darray));
  /* one thread per row; cudaPeekAtLastError catches launch-configuration failures */
3767:       GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray);
3768:       PetscCallCUDA(cudaPeekAtLastError());
3769:       PetscCall(VecCUDARestoreArrayWrite(diag, &darray));
3770:     }
3771:   } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
3772:   PetscFunctionReturn(PETSC_SUCCESS);
3773: }

/* Finish assembly by delegating to the host-side SeqAIJ routine; the device copy is refreshed
   later on demand via MatSeqAIJCUSPARSECopyToGPU (see MatMultAddKernel_SeqAIJCUSPARSE) */
3775: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3776: {
3777:   PetscFunctionBegin;
3778:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3779:   PetscFunctionReturn(PETSC_SUCCESS);
3780: }

3782: /*@
3783:   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

3785:   Collective

3787:   Input Parameters:
3788: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3789: . m    - number of rows
3790: . n    - number of columns
3791: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3792: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3794:   Output Parameter:
3795: . A - the matrix

3797:   Level: intermediate

3799:   Notes:
3800:   This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3801:   calculations. For good matrix assembly performance the user should preallocate the matrix
3802:   storage by setting the parameter `nz` (or the array `nnz`).

3804:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3805:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3806:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3808:   The AIJ format, also called
3809:   compressed row storage, is fully compatible with standard Fortran
3810:   storage.  That is, the stored row and column indices can begin at
3811:   either one (as in Fortran) or zero.

3813:   Specify the preallocated storage with either nz or nnz (not both).
3814:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3815:   allocation.

3817:   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

3819: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3820:           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3821: @*/
3822: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3823: {
3824:   PetscFunctionBegin;
3825:   PetscCall(MatCreate(comm, A));
3826:   PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: local sizes double as global sizes */
3827:   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3828:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); /* cast drops const; the array is only read */
3829:   PetscFunctionReturn(PETSC_SUCCESS);
3830: }

/* Destroy a SEQAIJCUSPARSE matrix: free the device-side data (plain multiply
   structures for unfactored matrices, triangular-factor structures otherwise),
   detach every composed method, then run the base SeqAIJ destructor. */
3832: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3833: {
3834:   PetscFunctionBegin;
3835:   if (A->factortype == MAT_FACTOR_NONE) {
3836:     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3837:   } else {
3838:     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3839:   }
3840:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3841:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3842:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3843:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3844:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3845:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3846:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3847:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3848:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3849:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3850:   PetscCall(MatDestroy_SeqAIJ(A));
3851:   PetscFunctionReturn(PETSC_SUCCESS);
3852: }

3854: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3855: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by building a plain SeqAIJ copy on the host, then converting the copy
   in place back to the SEQAIJCUSPARSE type. */
3856: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3857: {
3858:   PetscFunctionBegin;
3859:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3860:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3861:   PetscFunctionReturn(PETSC_SUCCESS);
3862: }

/* Compute Y = a*X + Y on the GPU when both matrices are SEQAIJCUSPARSE and unbound.

   Paths, in order of preference:
   - SAME_NONZERO_PATTERN: the two CSR value arrays are conformal, so a single
     cuBLAS axpy over the nz values suffices (the thrust::equal checks below may
     upgrade str to this case when the patterns happen to match);
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam with b = 1.0, writing the result in
     place into Y's CSR arrays (CUDA >= 11 needs an explicit work buffer);
   - anything else: fall back to the CPU MatAXPY_SeqAIJ(). */
3864: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3865: {
3866:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3867:   Mat_SeqAIJCUSPARSE *cy;
3868:   Mat_SeqAIJCUSPARSE *cx;
3869:   PetscScalar        *ay;
3870:   const PetscScalar  *ax;
3871:   CsrMatrix          *csry, *csrx;

3873:   PetscFunctionBegin;
3874:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3875:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3876:   if (X->ops->axpy != Y->ops->axpy) { /* mismatched ops => one matrix is bound to the CPU; do the work there */
3877:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3878:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3879:     PetscFunctionReturn(PETSC_SUCCESS);
3880:   }
3881:   /* if we are here, it means both matrices are bound to GPU */
3882:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3883:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3884:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3885:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3886:   csry = (CsrMatrix *)cy->mat->mat;
3887:   csrx = (CsrMatrix *)cx->mat->mat;
3888:   /* see if we can turn this into a cublas axpy: same nz count, then verify identical row offsets and column indices on the device */
3889:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3890:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3891:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3892:     if (eq) str = SAME_NONZERO_PATTERN;
3893:   }
3894:   /* spgeam is buggy with one column */
3895:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3897:   if (str == SUBSET_NONZERO_PATTERN) {
3898:     PetscScalar b = 1.0; /* Y's own contribution: result = a*X + 1.0*Y */
3899: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3900:     size_t bufferSize;
3901:     void  *buffer;
3902: #endif

3904:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3905:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3906:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); /* a and b live on the host here */
3907: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3908:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3909:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3910:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3911:     PetscCall(PetscLogGpuTimeBegin());
3912:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3913:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3914:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3915:     PetscCall(PetscLogGpuTimeEnd());
3916:     PetscCallCUDA(cudaFree(buffer));
3917: #else
3918:     PetscCall(PetscLogGpuTimeBegin());
3919:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3920:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3921:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3922:     PetscCall(PetscLogGpuTimeEnd());
3923: #endif
3924:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* restore the handle's default mode */
3925:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3926:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3927:   } else if (str == SAME_NONZERO_PATTERN) {
3928:     cublasHandle_t cublasv2handle;
3929:     PetscBLASInt   one = 1, bnz = 1;

3931:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3932:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3933:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3934:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3935:     PetscCall(PetscLogGpuTimeBegin());
3936:     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); /* ay += a * ax over the nz values */
3937:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3938:     PetscCall(PetscLogGpuTimeEnd());
3939:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3940:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3941:   } else {
3942:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3943:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3944:   }
3945:   PetscFunctionReturn(PETSC_SUCCESS);
3946: }

/* Scale all stored nonzero values of Y by a with one cuBLAS scal over the device
   value array; the sparsity pattern is untouched. */
3948: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3949: {
3950:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3951:   PetscScalar   *ay;
3952:   cublasHandle_t cublasv2handle;
3953:   PetscBLASInt   one = 1, bnz = 1;

3955:   PetscFunctionBegin;
3956:   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3957:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3958:   PetscCall(PetscBLASIntCast(y->nz, &bnz)); /* guard against nz overflowing the BLAS integer type */
3959:   PetscCall(PetscLogGpuTimeBegin());
3960:   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3961:   PetscCall(PetscLogGpuFlops(bnz));
3962:   PetscCall(PetscLogGpuTimeEnd());
3963:   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3964:   PetscFunctionReturn(PETSC_SUCCESS);
3965: }

/* Device functor for the left-diagonal scaling in MatDiagonalScale_SeqAIJCUSPARSE:
   invocation i multiplies every stored value of one CSR row by the corresponding
   entry of the left-scaling vector. When cprow_ptr is non-NULL the matrix uses
   compressed-row storage and cprow_ptr[i] maps the i-th stored row to its actual
   row index; otherwise row == i. */
3967: struct DiagonalScaleLeft {
3968:   const PetscScalar       *lv_ptr;   /* left-scaling vector (device) */
3969:   PetscScalar             *val_ptr;  /* CSR values (device, modified in place) */
3970:   const int               *row_ptr;  /* CSR row offsets (device) */
3971:   const PetscInt          *cprow_ptr; /* compressed-row -> actual-row map, or NULL */
3972:   __host__ __device__ void operator()(int i) const
3973:   {
3974:     const int         row = cprow_ptr ? (int)cprow_ptr[i] : i;
3975:     const PetscScalar s   = lv_ptr[row];
3976:     for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) val_ptr[j] *= s;
3977:   }
3978: };

/* Scale A on the device: A = diag(l) * A * diag(r). Either vector may be NULL to
   skip that side. Left scaling runs the DiagonalScaleLeft functor over the stored
   rows; right scaling multiplies each value by the r-entry of its column via a
   thrust::transform with a permutation iterator over the column indices. */
3980: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec l, Vec r)
3981: {
3982:   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)A->data;
3983:   CsrMatrix         *csr;
3984:   const PetscScalar *v;
3985:   PetscScalar       *av;
3986:   PetscInt           m, n;

3988:   PetscFunctionBegin;
3989:   PetscCall(PetscLogGpuTimeBegin());
3990:   PetscCall(MatSeqAIJCUSPARSEGetArray(A, &av));
3991:   csr = (CsrMatrix *)((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->mat;
3992:   if (l) {
3993:     const PetscInt   *cprow = ((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->cprowIndices ? ((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->cprowIndices->data().get() : NULL;
3994:     DiagonalScaleLeft functor;

3996:     PetscCall(VecGetLocalSize(l, &m));
3997:     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling Vec of wrong length");
3998:     PetscCall(VecCUDAGetArrayRead(l, &v));
3999:     functor = {v, av, csr->row_offsets->data().get(), cprow};
4000:     PetscCallThrust(thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(csr->num_rows), functor));
4001:     PetscCall(VecCUDARestoreArrayRead(l, &v));
4002:     PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
4003:   }
4004:   PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &av));
4005:   if (r) {
       /* NOTE(review): this path writes csr->values after MatSeqAIJCUSPARSERestoreArray()
          above; it relies on the restore leaving the device values current -- confirm. */
4006:     PetscCall(VecGetLocalSize(r, &n));
4007:     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling Vec of wrong length");
4008:     PetscCall(VecCUDAGetArrayRead(r, &v));
4009: #if CCCL_VERSION >= 3001000
4010:     PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(v), csr->column_indices->begin()), csr->values->begin(), cuda::std::multiplies<PetscScalar>()));
4011: #else
4012:     PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(v), csr->column_indices->begin()), csr->values->begin(), thrust::multiplies<PetscScalar>()));
4013: #endif
4014:     PetscCall(VecCUDARestoreArrayRead(r, &v));
4015:     PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
4016:   }
4017:   PetscCall(PetscLogGpuTimeEnd());
4018:   PetscFunctionReturn(PETSC_SUCCESS);
4019: }

/* Set every stored value of A to zero, keeping the sparsity pattern.

   If device CSR values exist (unfactored matrix), zero them (and any cached
   transpose values) with thrust::fill and mark the GPU copy authoritative;
   otherwise zero the host arrays and mark the CPU copy authoritative. */
4021: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
4022: {
4023:   PetscBool   gpu = PETSC_FALSE;
4024:   Mat_SeqAIJ *a   = (Mat_SeqAIJ *)A->data;

4026:   PetscFunctionBegin;
4027:   if (A->factortype == MAT_FACTOR_NONE) {
4028:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
4029:     if (spptr->mat) {
4030:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
4031:       if (matrix->values) {
4032:         gpu = PETSC_TRUE;
4033:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4034:       }
4035:     }
4036:     if (spptr->matTranspose) { /* keep the cached transpose consistent with A */
4037:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
4038:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4039:     }
4040:   }
4041:   if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
4042:   else {
4043:     PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* a->i[nrows] == total number of stored values */
4044:     A->offloadmask = PETSC_OFFLOAD_CPU;
4045:   }
4046:   PetscFunctionReturn(PETSC_SUCCESS);
4047: }

/* Report that this matrix type stores its data in CUDA device memory. */
4049: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
4050: {
4051:   PetscFunctionBegin;
4052:   *m = PETSC_MEMTYPE_CUDA;
4053:   PetscFunctionReturn(PETSC_SUCCESS);
4054: }

/* Bind the matrix to the CPU (flg = PETSC_TRUE) or to the GPU (PETSC_FALSE).

   Binding to the CPU downloads current values, installs the plain SeqAIJ kernels,
   zeroes the SeqAIJ array-access ops table, and removes the GPU-specific composed
   methods. Unbinding installs the CUSPARSE kernels, the CUSPARSE array-access ops,
   and recomposes the GPU-specific methods. Factored matrices only record the flag.
   The two branches must stay in one-to-one correspondence: every op/composed
   function set in one branch is reset in the other. */
4056: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
4057: {
4058:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

4060:   PetscFunctionBegin;
4061:   if (A->factortype != MAT_FACTOR_NONE) {
4062:     A->boundtocpu = flg;
4063:     PetscFunctionReturn(PETSC_SUCCESS);
4064:   }
4065:   if (flg) {
4066:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* make the host arrays current before switching kernels */

4068:     A->ops->scale                     = MatScale_SeqAIJ;
4069:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
4070:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
4071:     A->ops->axpy                      = MatAXPY_SeqAIJ;
4072:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4073:     A->ops->mult                      = MatMult_SeqAIJ;
4074:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4075:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4076:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4077:     A->ops->multhermitiantranspose    = NULL;
4078:     A->ops->multhermitiantransposeadd = NULL;
4079:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4080:     A->ops->getcurrentmemtype         = NULL;
4081:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* drop the CUSPARSE array-access hooks set below */
4082:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4083:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4084:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4085:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4086:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4087:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4088:   } else {
4089:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4090:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJCUSPARSE;
4091:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
4092:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4093:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4094:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4095:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4096:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4097:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4098:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4099:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4100:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4101:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
4102:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4103:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4104:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4105:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4106:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4107:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4108:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

4110:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4111:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4112:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4113:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4114:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4115:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4116:   }
4117:   A->boundtocpu = flg;
4118:   if (flg && a->inode.size_csr) { /* inode optimization only applies to the CPU kernels */
4119:     a->inode.use = PETSC_TRUE;
4120:   } else {
4121:     a->inode.use = PETSC_FALSE;
4122:   }
4123:   PetscFunctionReturn(PETSC_SUCCESS);
4124: }

/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE (initial, reuse, or in-place).

   Sets the default vector type to CUDA, allocates the device-side spptr (a
   Mat_SeqAIJCUSPARSE for regular matrices, a Mat_SeqAIJCUSPARSETriFactors for
   factored ones) with its cusparse handle bound to PETSc's default stream,
   installs the CUSPARSE method table via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE),
   and composes the type-specific methods. */
4126: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4127: {
4128:   Mat B;

4130:   PetscFunctionBegin;
4131:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4132:   if (reuse == MAT_INITIAL_MATRIX) {
4133:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4134:   } else if (reuse == MAT_REUSE_MATRIX) {
4135:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4136:   }
4137:   B = *newmat; /* MAT_INPLACE_MATRIX: convert A itself (newmat already aliases A) */

4139:   PetscCall(PetscFree(B->defaultvectype));
4140:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

4142:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4143:     if (B->factortype == MAT_FACTOR_NONE) {
4144:       Mat_SeqAIJCUSPARSE *spptr;
4145:       PetscCall(PetscNew(&spptr));
4146:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4147:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4148:       spptr->format = MAT_CUSPARSE_CSR;
4149: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4150:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4151:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4152:   #else
4153:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4154:   #endif
4155:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4156:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4157: #endif
4158:       B->spptr = spptr;
4159:     } else {
4160:       Mat_SeqAIJCUSPARSETriFactors *spptr;

4162:       PetscCall(PetscNew(&spptr));
4163:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4164:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4165:       B->spptr = spptr;
4166:     }
4167:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4168:   }
4169:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
4170:   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
4171:   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
4172:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
4173:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
4174:   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
4175:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

4177:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* install the GPU kernel table */
4178:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4179:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4180: #if defined(PETSC_HAVE_HYPRE)
4181:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4182: #endif
4183:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4184:   PetscFunctionReturn(PETSC_SUCCESS);
4185: }

/* Type constructor registered for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix,
   then convert it in place to the CUSPARSE type. */
4187: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4188: {
4189:   PetscFunctionBegin;
4190:   PetscCall(MatCreate_SeqAIJ(B));
4191:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4192:   PetscFunctionReturn(PETSC_SUCCESS);
4193: }

4195: /*MC
4196:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

4198:    Options Database Keys:
4199: +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4200: .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4201:                                            Other options include ell (ellpack) or hyb (hybrid).
4202: .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4203: -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU

4205:   Level: beginner

4207:   Notes:
4208:   These matrices can be in either CSR, ELL, or HYB format.

4210:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

4212:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4213:   if some integer values passed in do not fit in `int`.

4215: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4216: M*/

/* Register the CUSPARSE solver package for LU, Cholesky, ILU, and ICC
   factorizations of MATSEQAIJCUSPARSE matrices; all four share one factory. */
4218: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4219: {
4220:   PetscFunctionBegin;
4221:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4222:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4223:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4224:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4225:   PetscFunctionReturn(PETSC_SUCCESS);
4226: }

/* Free all device-side data of an unfactored SEQAIJCUSPARSE matrix: the multiply
   structures for A and its cached transpose, the thrust work vectors, and finally
   the cusparse handle and the spptr itself. Safe to call when spptr is NULL. */
4228: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4229: {
4230:   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

4232:   PetscFunctionBegin;
4233:   if (cusp) {
4234:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4235:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4236:     delete cusp->workVector;
4237:     delete cusp->rowoffsets_gpu;
4238:     delete cusp->csr2csc_i;
4239:     delete cusp->coords;
4240:     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4241:     PetscCall(PetscFree(mat->spptr)); /* also sets mat->spptr to NULL */
4242:   }
4243:   PetscFunctionReturn(PETSC_SUCCESS);
4244: }

/* Release a CsrMatrix and its three device arrays (values, column indices,
   row offsets), then NULL out the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

4259: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one legacy (pre-CUDA-11.4 csrsv-based) triangular-factor structure:
   its cusparse descriptors, CSR storage, solve/transpose work buffers, and the
   pinned host copy of the values. Safe to call when *trifactor is NULL. */
4260: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4261: {
4262:   PetscFunctionBegin;
4263:   if (*trifactor) {
4264:     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4265:     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4266:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4267:     PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4268:     PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory: must use cudaFreeHost */
4269:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4270:     PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4271:   #endif
4272:     PetscCall(PetscFree(*trifactor)); /* also NULLs the caller's pointer */
4273:   }
4274:   PetscFunctionReturn(PETSC_SUCCESS);
4275: }
4276: #endif

/* Free one multiply structure: the stored matrix (CSR, or pre-CUDA-11 HYB/ELL),
   its cusparse descriptors, the compressed-row index vector, the device-resident
   scalar constants, and (CUDA >= 11) the cached SpMV/SpMM descriptors and buffers.
   Safe to call when *matstruct is NULL. */
4278: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4279: {
4280:   CsrMatrix *mat;

4282:   PetscFunctionBegin;
4283:   if (*matstruct) {
4284:     if ((*matstruct)->mat) {
4285:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4286: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4287:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4288: #else
4289:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4290:         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4291: #endif
4292:       } else {
4293:         mat = (CsrMatrix *)(*matstruct)->mat;
4294:         PetscCall(CsrMatrix_Destroy(&mat));
4295:       }
4296:     }
4297:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4298:     delete (*matstruct)->cprowIndices;
4299:     PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); /* device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE */
4300:     PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4301:     PetscCallCUDA(cudaFree((*matstruct)->beta_one));

4303: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4304:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4305:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

       /* one cached SpMV setup per operation variant (see cuSpMV[3]) */
4307:     for (int i = 0; i < 3; i++) {
4308:       if (mdata->cuSpMV[i].initialized) {
4309:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4310:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4311:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4312:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4313:         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4314:         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4315:   #endif
4316:       }
4317:     }
4318: #endif
4319:     delete *matstruct;
4320:     *matstruct = NULL;
4321:   }
4322:   PetscFunctionReturn(PETSC_SUCCESS);
4323: }

/* Reset (empty) the triangular-factor container without destroying the container
   or its cusparse handle, so it can be refilled by a new factorization.

   Frees the legacy per-factor structures (CUDA < 11.4) or the SpSV-based CSR
   arrays, descriptors, and work buffers (CUDA >= 11.4), plus the row/column
   permutation index vectors shared by both code paths. */
4325: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4326: {
4327:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

4329:   PetscFunctionBegin;
4330:   if (fs) {
4331: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4332:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4333:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4334:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4335:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4336:     delete fs->workVector;
4337:     fs->workVector = NULL;
4338: #endif
4339:     delete fs->rpermIndices;
4340:     delete fs->cpermIndices;
4341:     fs->rpermIndices  = NULL;
4342:     fs->cpermIndices  = NULL;
4343:     fs->init_dev_prop = PETSC_FALSE;
4344: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4345:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4346:     PetscCallCUDA(cudaFree(fs->csrColIdx));
4347:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4348:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4349:     PetscCallCUDA(cudaFree(fs->csrVal));
4350:     PetscCallCUDA(cudaFree(fs->diag));
4351:     PetscCallCUDA(cudaFree(fs->X));
4352:     PetscCallCUDA(cudaFree(fs->Y));
4353:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4354:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4355:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4356:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4357:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4358:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4359:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4360:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4361:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4362:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4363:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4364:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4365:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4366:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4367:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4368:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4369:     PetscCall(PetscFree(fs->csrRowPtr_h));
4370:     PetscCall(PetscFree(fs->csrVal_h));
4371:     PetscCall(PetscFree(fs->diag_h));
4372:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4373:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4374: #endif
4375:   }
4376:   PetscFunctionReturn(PETSC_SUCCESS);
4377: }

/* Fully destroy the triangular-factor container: empty it via _Reset(), then
   release the cusparse handle and the container itself. */
4379: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4380: {
4381:   PetscFunctionBegin;
4382:   if (*trifactors) {
4383:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4384:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4385:     PetscCall(PetscFree(*trifactors)); /* also NULLs the caller's pointer */
4386:   }
4387:   PetscFunctionReturn(PETSC_SUCCESS);
4388: }

/* Mark the cached device transpose of A as stale. With destroy = PETSC_TRUE the
   transpose structure and the csr2csc index permutation are freed outright;
   otherwise only A->transupdated is cleared so the transpose values are
   regenerated on next use. No-op if A has no device data yet. */
4390: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4391: {
4392:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4394:   PetscFunctionBegin;
4395:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4396:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4397:   if (destroy) {
4398:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4399:     delete cusp->csr2csc_i;
4400:     cusp->csr2csc_i = NULL;
4401:   }
4402:   A->transupdated = PETSC_FALSE;
4403:   PetscFunctionReturn(PETSC_SUCCESS);
4404: }

/* Container destructor for the device-side COO assembly struct: the perm and jmap
   arrays were allocated with cudaMalloc (see MatSetPreallocationCOO_SeqAIJCUSPARSE),
   the struct itself with PetscMalloc. */
4406: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(PetscCtxRt ctx)
4407: {
4408:   MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)ctx;

4410:   PetscFunctionBegin;
4411:   PetscCallCUDA(cudaFree(coo->perm));
4412:   PetscCallCUDA(cudaFree(coo->jmap));
4413:   PetscCall(PetscFree(coo));
4414:   PetscFunctionReturn(PETSC_SUCCESS);
4415: }

// Build COO assembly metadata for a MATSEQAIJCUSPARSE matrix: run the host SeqAIJ
// preallocation on (possibly staged-to-host) coo_i/coo_j, then mirror the resulting
// COO struct (jmap, perm) into device memory so MatSetValuesCOO_SeqAIJCUSPARSE()
// can assemble values entirely on the GPU.
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE; /* were coo_i/coo_j given in device memory? */
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d; /* host- and device-resident views of the COO struct */

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host preallocation routine below needs host indices; stage a temporary copy */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* the CSR structure was just built on the host */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  /* jmap has nz+1 entries; perm has Atot entries (one per user-provided COO entry) */
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Kernel: fold user-provided COO values kv[] into the CSR value array a[].
// For each CSR nonzero idx, sum kv[perm[p]] over p in [jmap[idx], jmap[idx+1])
// (all COO entries mapped to that slot). INSERT_VALUES discards the previous
// a[idx]; ADD_VALUES accumulates onto it. A grid-stride loop makes any launch
// configuration cover all nnz entries.
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) {
    PetscScalar acc = 0.0;
    for (PetscCount p = jmap[idx]; p < jmap[idx + 1]; p++) acc += kv[perm[p]];
    a[idx] = (imode == INSERT_VALUES) ? acc : (a[idx] + acc);
  }
}

// Assemble the matrix values on the device from user-provided COO values v[]
// (ordered as in the i/j arrays passed to MatSetPreallocationCOO()). v may live
// in host or device memory; host input is staged through a temporary device
// buffer. INSERT_VALUES overwrites existing entries, ADD_VALUES accumulates.
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz; /* nonzeros in the assembled CSR matrix */
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa; /* device pointer to the CSR value array */
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing device COO struct; call MatSetPreallocationCOO() first");
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: skip copying stale values to device */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Do the ceiling division in 64-bit PetscCount BEFORE narrowing to int so the
       block count cannot overflow for very large Annz, and clamp to the CUDA grid
       x-dimension limit (2^31-1). The kernel's grid-stride loop still covers all
       entries when the grid is clamped. */
    const int nblocks = (int)PetscMin((Annz + 255) / 256, (PetscCount)2147483647);
    MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the sticky error */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* release staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both outputs must be requested; otherwise this is a no-op */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the CSR structure exists and is current on the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* device CSR omits empty rows; lazily build and cache the full (m+1)-entry
         row-offset array on the device from the host row pointers a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: just invalidate the caller's pointers, no data movement needed */
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed; /* accepted for symmetry with MatSeqAIJCUSPARSEGetIJ(); unused here */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make the device values current before handing them out */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only: neither the offload mask nor the cached transpose needs to change */
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access cannot have changed the matrix, so no object state increase here */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access starts from up-to-date device values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may write through the pointer: the device copy becomes authoritative
     and any cached transpose values are now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer, so bump the object state */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* note: no MatSeqAIJCUSPARSECopyToGPU() here — write-only access never reads old values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the device copy is declared authoritative and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the caller wrote new values, so bump the object state */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict-weak ordering on (row, col) for zip-iterator tuples of
   (row, col, value, flag): sort by row first, then by column; the value and
   flag components do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};

/* Unary functor adding a fixed offset to an int index; used with
   thrust::make_transform_iterator to shift column indices / row offsets when
   concatenating two matrices. */
struct Shift {
  int _offset; // amount added to every input value

  Shift(int shift) : _offset(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _offset; }
};

4782: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4783: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4784: {
4785:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4786:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4787:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4788:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4789:   PetscInt                      Annz, Bnnz;
4790:   cusparseStatus_t              stat;
4791:   PetscInt                      i, m, n, zero = 0;

4793:   PetscFunctionBegin;
4796:   PetscAssertPointer(C, 4);
4797:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4798:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4799:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4800:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4801:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4802:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4803:   if (reuse == MAT_INITIAL_MATRIX) {
4804:     m = A->rmap->n;
4805:     n = A->cmap->n + B->cmap->n;
4806:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4807:     PetscCall(MatSetSizes(*C, m, n, m, n));
4808:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4809:     c                       = (Mat_SeqAIJ *)(*C)->data;
4810:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4811:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4812:     Ccsr                    = new CsrMatrix;
4813:     Cmat->cprowIndices      = NULL;
4814:     c->compressedrow.use    = PETSC_FALSE;
4815:     c->compressedrow.nrows  = 0;
4816:     c->compressedrow.i      = NULL;
4817:     c->compressedrow.rindex = NULL;
4818:     Ccusp->workVector       = NULL;
4819:     Ccusp->nrows            = m;
4820:     Ccusp->mat              = Cmat;
4821:     Ccusp->mat->mat         = Ccsr;
4822:     Ccsr->num_rows          = m;
4823:     Ccsr->num_cols          = n;
4824:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4825:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4826:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4827:     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4828:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4829:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4830:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4831:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4832:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4833:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4834:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4835:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4836:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

4838:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4839:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4840:     Annz                 = (PetscInt)Acsr->column_indices->size();
4841:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4842:     c->nz                = Annz + Bnnz;
4843:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4844:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4845:     Ccsr->values         = new THRUSTARRAY(c->nz);
4846:     Ccsr->num_entries    = c->nz;
4847:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4848:     if (c->nz) {
4849:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4850:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4851:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4852:       THRUSTINTARRAY32 *Aroff, *Broff;

4854:       if (a->compressedrow.use) { /* need full row offset */
4855:         if (!Acusp->rowoffsets_gpu) {
4856:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4857:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4858:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4859:         }
4860:         Aroff = Acusp->rowoffsets_gpu;
4861:       } else Aroff = Acsr->row_offsets;
4862:       if (b->compressedrow.use) { /* need full row offset */
4863:         if (!Bcusp->rowoffsets_gpu) {
4864:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4865:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4866:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4867:         }
4868:         Broff = Bcusp->rowoffsets_gpu;
4869:       } else Broff = Bcsr->row_offsets;
4870:       PetscCall(PetscLogGpuTimeBegin());
4871:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4872:       PetscCallCUSPARSE(stat);
4873:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4874:       PetscCallCUSPARSE(stat);
4875:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4876:       auto Aperm = thrust::make_constant_iterator(1);
4877:       auto Bperm = thrust::make_constant_iterator(0);
4878: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4879:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4880:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4881: #else
4882:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4883:       auto Bcib = Bcsr->column_indices->begin();
4884:       auto Bcie = Bcsr->column_indices->end();
4885:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4886: #endif
4887:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4888:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4889:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4890:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4891:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4892:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4893:       auto p1    = Ccusp->coords->begin();
4894:       auto p2    = Ccusp->coords->begin();
4895: #if CCCL_VERSION >= 3001000
4896:       cuda::std::advance(p2, Annz);
4897: #else
4898:       thrust::advance(p2, Annz);
4899: #endif
4900:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4901: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4902:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4903: #endif
4904:       auto cci = thrust::make_counting_iterator(zero);
4905:       auto cce = thrust::make_counting_iterator(c->nz);
4906: #if 0 //Errors on SUMMIT cuda 11.1.0
4907:       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4908: #else
4909:   #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4910:       auto pred = thrust::identity<int>();
4911:   #else
4912:       auto pred = cuda::std::identity();
4913:   #endif
4914:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4915:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4916: #endif
4917:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4918:       PetscCallCUSPARSE(stat);
4919:       PetscCall(PetscLogGpuTimeEnd());
4920:       delete wPerm;
4921:       delete Acoo;
4922:       delete Bcoo;
4923:       delete Ccoo;
4924: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4925:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4926:       PetscCallCUSPARSE(stat);
4927: #endif
4928:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4929:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4930:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4931:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4932:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4933:         CsrMatrix                    *CcsrT = new CsrMatrix;
4934:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4935:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4937:         (*C)->form_explicit_transpose = PETSC_TRUE;
4938:         (*C)->transupdated            = PETSC_TRUE;
4939:         Ccusp->rowoffsets_gpu         = NULL;
4940:         CmatT->cprowIndices           = NULL;
4941:         CmatT->mat                    = CcsrT;
4942:         CcsrT->num_rows               = n;
4943:         CcsrT->num_cols               = m;
4944:         CcsrT->num_entries            = c->nz;

4946:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4947:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4948:         CcsrT->values         = new THRUSTARRAY(c->nz);

4950:         PetscCall(PetscLogGpuTimeBegin());
4951:         auto rT = CcsrT->row_offsets->begin();
4952:         if (AT) {
4953:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4954: #if CCCL_VERSION >= 3001000
4955:           cuda::std::advance(rT, -1);
4956: #else
4957:           thrust::advance(rT, -1);
4958: #endif
4959:         }
4960:         if (BT) {
4961:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4962:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4963:           thrust::copy(titb, tite, rT);
4964:         }
4965:         auto cT = CcsrT->column_indices->begin();
4966:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4967:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4968:         auto vT = CcsrT->values->begin();
4969:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4970:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4971:         PetscCall(PetscLogGpuTimeEnd());

4973:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4974:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4975:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4976:         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4977:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4978:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4979:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4980:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4981:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4982: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4983:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4984:         PetscCallCUSPARSE(stat);
4985: #endif
4986:         Ccusp->matTranspose = CmatT;
4987:       }
4988:     }

4990:     c->free_a = PETSC_TRUE;
4991:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4992:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4993:     c->free_ij = PETSC_TRUE;
4994:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4995:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4996:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4997:       ii = *Ccsr->row_offsets;
4998:       jj = *Ccsr->column_indices;
4999:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5000:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5001:     } else {
5002:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5003:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5004:     }
5005:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5006:     PetscCall(PetscMalloc1(m, &c->ilen));
5007:     PetscCall(PetscMalloc1(m, &c->imax));
5008:     c->maxnz         = c->nz;
5009:     c->nonzerorowcnt = 0;
5010:     c->rmax          = 0;
5011:     for (i = 0; i < m; i++) {
5012:       const PetscInt nn = c->i[i + 1] - c->i[i];
5013:       c->ilen[i] = c->imax[i] = nn;
5014:       c->nonzerorowcnt += (PetscInt)!!nn;
5015:       c->rmax = PetscMax(c->rmax, nn);
5016:     }
5017:     PetscCall(PetscMalloc1(c->nz, &c->a));
5018:     (*C)->nonzerostate++;
5019:     PetscCall(PetscLayoutSetUp((*C)->rmap));
5020:     PetscCall(PetscLayoutSetUp((*C)->cmap));
5021:     Ccusp->nonzerostate = (*C)->nonzerostate;
5022:     (*C)->preallocated  = PETSC_TRUE;
5023:   } else {
5024:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5025:     c = (Mat_SeqAIJ *)(*C)->data;
5026:     if (c->nz) {
5027:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5028:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
5029:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5030:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5031:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5032:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5033:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5034:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5035:       Acsr = (CsrMatrix *)Acusp->mat->mat;
5036:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5037:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5038:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5039:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5040:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5041:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5042:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
5043:       auto pmid = Ccusp->coords->begin();
5044: #if CCCL_VERSION >= 3001000
5045:       cuda::std::advance(pmid, Acsr->num_entries);
5046: #else
5047:       thrust::advance(pmid, Acsr->num_entries);
5048: #endif
5049:       PetscCall(PetscLogGpuTimeBegin());
5050:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
5051:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5052:       thrust::for_each(zibait, zieait, VecCUDAEquals());
5053:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5054:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
5055:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5056:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5057:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5058:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5059:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5060:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5061:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5062:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5063:         auto       vT    = CcsrT->values->begin();
5064:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5065:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5066:         (*C)->transupdated = PETSC_TRUE;
5067:       }
5068:       PetscCall(PetscLogGpuTimeEnd());
5069:     }
5070:   }
5071:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5072:   (*C)->assembled     = PETSC_TRUE;
5073:   (*C)->was_assembled = PETSC_FALSE;
5074:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5075:   PetscFunctionReturn(PETSC_SUCCESS);
5076: }

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Gather selected entries of the matrix value
  array on the GPU into v.

  Input Parameters:
+ A   - the SEQAIJCUSPARSE matrix
. n   - number of entries to copy
- idx - indices (into the aij value array) of the entries to gather; if NULL, the
        leading n entries are copied contiguously

  Output Parameter:
. v   - destination buffer; may be either host or device memory (detected at runtime)

  Notes:
  When idx is given and v is host memory, the gather is performed on the device into a
  temporary buffer which is then copied back to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* is the destination device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* host destination: gather into a temporary device buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[idx[i]] via a permutation-iterator gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination implies the values traveled GPU -> CPU, so log in that direction */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}