Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
 15: #include <thrust/adjacent_difference.h>
 16: #if PETSC_CPP_VERSION >= 14
 17:   #define PETSC_HAVE_THRUST_ASYNC 1
 18: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 19: #endif
 20: #include <thrust/iterator/constant_iterator.h>
 21: #include <thrust/remove.h>
 22: #include <thrust/sort.h>
 23: #include <thrust/tuple.h>
 24: #include <thrust/unique.h>
 25: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0)
 26:   #include <cuda/std/functional>
 27: #endif

 29: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 30: /*
 31:   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 32:   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
 33: */
 34: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 35: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 36: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};

 38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 42: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 43: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 44: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 45: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
 46: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 47: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 48: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 49: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 50: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 51: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 52: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 54: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 55: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 56: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 57: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 59: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 60: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 62: static PetscErrorCode       MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 63: static PetscErrorCode       MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 64: static PetscErrorCode       MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
 65: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);

 67: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 68: {
 69:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

 71:   PetscFunctionBegin;
 72:   switch (op) {
 73:   case MAT_CUSPARSE_MULT:
 74:     cusparsestruct->format = format;
 75:     break;
 76:   case MAT_CUSPARSE_ALL:
 77:     cusparsestruct->format = format;
 78:     break;
 79:   default:
 80:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
 81:   }
 82:   PetscFunctionReturn(PETSC_SUCCESS);
 83: }

 85: /*@
 86:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
 87:   operation. Only the `MatMult()` operation can use different GPU storage formats

 89:   Not Collective

 91:   Input Parameters:
 92: + A      - Matrix of type `MATSEQAIJCUSPARSE`
 93: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
 94:            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
 95: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

 97:   Level: intermediate

 99: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
100: @*/
101: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
102: {
103:   PetscFunctionBegin;
105:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
106:   PetscFunctionReturn(PETSC_SUCCESS);
107: }

109: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
110: {
111:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

113:   PetscFunctionBegin;
114:   cusparsestruct->use_cpu_solve = use_cpu;
115:   PetscFunctionReturn(PETSC_SUCCESS);
116: }

118: /*@
119:   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

121:   Input Parameters:
122: + A       - Matrix of type `MATSEQAIJCUSPARSE`
123: - use_cpu - set flag for using the built-in CPU `MatSolve()`

125:   Level: intermediate

127:   Note:
128:   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
129:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
130:   This method to specify if the solve is done on the CPU or GPU (GPU is the default).

132: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
133: @*/
134: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
135: {
136:   PetscFunctionBegin;
138:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
139:   PetscFunctionReturn(PETSC_SUCCESS);
140: }

142: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
143: {
144:   PetscFunctionBegin;
145:   switch (op) {
146:   case MAT_FORM_EXPLICIT_TRANSPOSE:
147:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
148:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
149:     A->form_explicit_transpose = flg;
150:     break;
151:   default:
152:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
153:     break;
154:   }
155:   PetscFunctionReturn(PETSC_SUCCESS);
156: }

158: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
159: {
160:   MatCUSPARSEStorageFormat format;
161:   PetscBool                flg;
162:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

164:   PetscFunctionBegin;
165:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
166:   if (A->factortype == MAT_FACTOR_NONE) {
167:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
168:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

170:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
171:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
172:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
173:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
174:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
175:     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
176:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
177:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
178:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
179:     PetscCall(
180:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
181:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
182:   }
183:   PetscOptionsHeadEnd();
184:   PetscFunctionReturn(PETSC_SUCCESS);
185: }

187: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
188: {
189:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
190:   PetscInt                      m  = A->rmap->n;
191:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
192:   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
193:   const MatScalar              *Aa = a->a;
194:   PetscInt                     *Mi, *Mj, Mnz;
195:   PetscScalar                  *Ma;

197:   PetscFunctionBegin;
198:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
199:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
200:     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
201:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
202:       Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
203:       PetscCall(PetscMalloc1(m + 1, &Mi));
204:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
205:       PetscCall(PetscMalloc1(Mnz, &Ma));
206:       Mi[0] = 0;
207:       for (PetscInt i = 0; i < m; i++) {
208:         PetscInt llen = Ai[i + 1] - Ai[i];
209:         PetscInt ulen = adiag[i] - adiag[i + 1];
210:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
211:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
212:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
213:         Mi[i + 1] = Mi[i] + llen + ulen;
214:       }
215:       // Copy M (L,U) from host to device
216:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
217:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
218:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
219:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
220:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

222:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
223:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
224:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
225:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
226:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
227:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
228:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
229:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

231:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
232:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
233:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

235:       fillMode = CUSPARSE_FILL_MODE_UPPER;
236:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
237:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
238:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
239:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

241:       // Allocate work vectors in SpSv
242:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
243:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

245:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
246:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

248:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
249:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
250:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
251:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
252:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
253:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
254:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

256:       // Record for reuse
257:       fs->csrRowPtr_h = Mi;
258:       fs->csrVal_h    = Ma;
259:       PetscCall(PetscFree(Mj));
260:     }
261:     // Copy the value
262:     Mi  = fs->csrRowPtr_h;
263:     Ma  = fs->csrVal_h;
264:     Mnz = Mi[m];
265:     for (PetscInt i = 0; i < m; i++) {
266:       PetscInt llen = Ai[i + 1] - Ai[i];
267:       PetscInt ulen = adiag[i] - adiag[i + 1];
268:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
269:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
270:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
271:     }
272:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

274: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
275:     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
276:       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
277:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
278:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
279:     } else
280: #endif
281:     {
282:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
283:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

285:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
286:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
287:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
288:     }
289:   }
290:   PetscFunctionReturn(PETSC_SUCCESS);
291: }

293: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
294: {
295:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
296:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
297:   IS                            isrow = a->row, isicol = a->icol;
298:   PetscBool                     row_identity, col_identity;
299:   PetscInt                      n = A->rmap->n;

301:   PetscFunctionBegin;
302:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
303:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));

305:   cusparseTriFactors->nnz = a->nz;

307:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
308:   /* lower triangular indices */
309:   PetscCall(ISIdentity(isrow, &row_identity));
310:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
311:     const PetscInt *r;

313:     PetscCall(ISGetIndices(isrow, &r));
314:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
315:     cusparseTriFactors->rpermIndices->assign(r, r + n);
316:     PetscCall(ISRestoreIndices(isrow, &r));
317:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
318:   }

320:   /* upper triangular indices */
321:   PetscCall(ISIdentity(isicol, &col_identity));
322:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
323:     const PetscInt *c;

325:     PetscCall(ISGetIndices(isicol, &c));
326:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
327:     cusparseTriFactors->cpermIndices->assign(c, c + n);
328:     PetscCall(ISRestoreIndices(isicol, &c));
329:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
330:   }
331:   PetscFunctionReturn(PETSC_SUCCESS);
332: }

334: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
335: {
336:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
337:   PetscInt                      m  = A->rmap->n;
338:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
339:   const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
340:   const MatScalar              *Aa = a->a;
341:   PetscInt                     *Mj, Mnz;
342:   PetscScalar                  *Ma, *D;

344:   PetscFunctionBegin;
345:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
346:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
347:     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
348:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
349:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
350:       Mnz = Ai[m]; // Unz (with the unit diagonal)
351:       PetscCall(PetscMalloc1(Mnz, &Ma));
352:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
353:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
354:       for (PetscInt i = 0; i < m; i++) {
355:         PetscInt ulen = Ai[i + 1] - Ai[i];
356:         Mj[Ai[i]]     = i;                                              // diagonal entry
357:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
358:       }
359:       // Copy M (U) from host to device
360:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
361:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
362:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
363:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
364:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
365:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

367:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
368:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
369:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
370:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
371:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
372:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
373:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
374:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

376:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
377:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
378:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

380:       // Allocate work vectors in SpSv
381:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
382:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

384:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
385:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

387:       // Query buffer sizes for SpSV and then allocate buffers
388:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
389:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
390:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

392:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
393:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
394:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

396:       // Record for reuse
397:       fs->csrVal_h = Ma;
398:       fs->diag_h   = D;
399:       PetscCall(PetscFree(Mj));
400:     }
401:     // Copy the value
402:     Ma  = fs->csrVal_h;
403:     D   = fs->diag_h;
404:     Mnz = Ai[m];
405:     for (PetscInt i = 0; i < m; i++) {
406:       D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
407:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
408:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
409:     }
410:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
411:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

413: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
414:     if (fs->updatedSpSVAnalysis) {
415:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
416:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
417:     } else
418: #endif
419:     {
420:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
421:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
422:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
423:       fs->updatedSpSVAnalysis = PETSC_TRUE;
424:     }
425:   }
426:   PetscFunctionReturn(PETSC_SUCCESS);
427: }

429: // Solve Ut D U x = b
430: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
431: {
432:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
433:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
434:   const PetscScalar                    *barray;
435:   PetscScalar                          *xarray;
436:   thrust::device_ptr<const PetscScalar> bGPU;
437:   thrust::device_ptr<PetscScalar>       xGPU;
438:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
439:   PetscInt                              m   = A->rmap->n;

441:   PetscFunctionBegin;
442:   PetscCall(PetscLogGpuTimeBegin());
443:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
444:   PetscCall(VecCUDAGetArrayRead(b, &barray));
445:   xGPU = thrust::device_pointer_cast(xarray);
446:   bGPU = thrust::device_pointer_cast(barray);

448:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
449:   if (fs->rpermIndices) {
450:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
451:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
452:   } else {
453:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
454:   }

456:   // Solve Ut Y = X
457:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
458:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

460:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
461:   // It is basically a vector element-wise multiplication, but cublas does not have it!
462: #if CCCL_VERSION >= 3001000
463:   auto multiplies = cuda::std::multiplies<PetscScalar>();
464: #else
465:   auto multiplies = thrust::multiplies<PetscScalar>();
466: #endif
467:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));

469:   // Solve U X = Y
470:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
471:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
472:   } else {
473:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
474:   }
475:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

477:   // Reorder X with the column permutation if needed, and put the result back to x
478:   if (fs->cpermIndices) {
479:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
480:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
481:   }

483:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
484:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
485:   PetscCall(PetscLogGpuTimeEnd());
486:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
487:   PetscFunctionReturn(PETSC_SUCCESS);
488: }

490: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
491: {
492:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
493:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
494:   IS                            ip                 = a->row;
495:   PetscBool                     perm_identity;
496:   PetscInt                      n = A->rmap->n;

498:   PetscFunctionBegin;
499:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

501:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
502:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

504:   A->offloadmask = PETSC_OFFLOAD_BOTH;

506:   /* lower triangular indices */
507:   PetscCall(ISIdentity(ip, &perm_identity));
508:   if (!perm_identity) {
509:     IS              iip;
510:     const PetscInt *irip, *rip;

512:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
513:     PetscCall(ISGetIndices(iip, &irip));
514:     PetscCall(ISGetIndices(ip, &rip));
515:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
516:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
517:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
518:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
519:     PetscCall(ISRestoreIndices(iip, &irip));
520:     PetscCall(ISDestroy(&iip));
521:     PetscCall(ISRestoreIndices(ip, &rip));
522:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
523:   }
524:   PetscFunctionReturn(PETSC_SUCCESS);
525: }

527: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
528: {
529:   PetscFunctionBegin;
530:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
531:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
532:   B->offloadmask            = PETSC_OFFLOAD_CPU;
533:   B->ops->solve             = MatSolve_SeqAIJCUSPARSE_Cholesky;
534:   B->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_Cholesky;
535:   B->ops->matsolve          = NULL;
536:   B->ops->matsolvetranspose = NULL;
537:   /* get the triangular factors */
538:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
539:   PetscFunctionReturn(PETSC_SUCCESS);
540: }

542: struct PetscScalarToPetscInt {
543:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
544: };

546: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
547: {
548:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
549:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
550:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
551:   cusparseStatus_t              stat;
552:   cusparseIndexBase_t           indexBase;

554:   PetscFunctionBegin;
555:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
556:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
557:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
558:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
559:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
560:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
561:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
562:   PetscCall(PetscLogGpuTimeBegin());
563:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
564:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
565:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
566:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
567:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
568:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
569:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

571:     /* set alpha and beta */
572:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
573:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
574:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
575:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
576:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
577:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

579:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
580:       CsrMatrix *matrixT      = new CsrMatrix;
581:       matstructT->mat         = matrixT;
582:       matrixT->num_rows       = A->cmap->n;
583:       matrixT->num_cols       = A->rmap->n;
584:       matrixT->num_entries    = a->nz;
585:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
586:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
587:       matrixT->values         = new THRUSTARRAY(a->nz);

589:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
590:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
591:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
592:                                indexBase, cusparse_scalartype);
593:       PetscCallCUSPARSE(stat);
594:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
595:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
596:     }
597:   }
598:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
599:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
600:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
601:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
602:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
603:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
604:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
605:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
606:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
607:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
608:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
609:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
610:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
611:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
612:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
613:     }
614:     if (!cusparsestruct->csr2csc_i) {
615:       THRUSTARRAY csr2csc_a(matrix->num_entries);
616:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

618:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
619:       void  *csr2cscBuffer;
620:       size_t csr2cscBufferSize;
621:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
622:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
623:       PetscCallCUSPARSE(stat);
624:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));

626:       if (matrix->num_entries) {
627:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
628:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
629:            I checked every parameters and they were just fine. I have no clue why cusparse complains.

631:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
632:            should be filled with indexBase. So I just take a shortcut here.
633:         */
634:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
635:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
636:         PetscCallCUSPARSE(stat);
637:       } else {
638:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
639:       }

641:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
642:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
643:       PetscCallCUDA(cudaFree(csr2cscBuffer));
644:     }
645:     PetscCallThrust(
646:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
647:   }
648:   PetscCall(PetscLogGpuTimeEnd());
649:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
650:   /* the compressed row indices is not used for matTranspose */
651:   matstructT->cprowIndices = NULL;
652:   /* assign the pointer */
653:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
654:   A->transupdated                                = PETSC_TRUE;
655:   PetscFunctionReturn(PETSC_SUCCESS);
656: }

658: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
659: {
660:   const PetscScalar                    *barray;
661:   PetscScalar                          *xarray;
662:   thrust::device_ptr<const PetscScalar> bGPU;
663:   thrust::device_ptr<PetscScalar>       xGPU;
664:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
665:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
666:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
667:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
668:   PetscInt                              m   = A->rmap->n;

670:   PetscFunctionBegin;
671:   PetscCall(PetscLogGpuTimeBegin());
672:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
673:   PetscCall(VecCUDAGetArrayRead(b, &barray));
674:   xGPU = thrust::device_pointer_cast(xarray);
675:   bGPU = thrust::device_pointer_cast(barray);

677:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
678:   if (fs->rpermIndices) {
679:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
680:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
681:   } else {
682:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
683:   }

685:   // Solve L Y = X
686:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
687:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
688:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

690:   // Solve U X = Y
691:   if (fs->cpermIndices) {
692:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
693:   } else {
694:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
695:   }
696:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

698:   // Reorder X with the column permutation if needed, and put the result back to x
699:   if (fs->cpermIndices) {
700:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
701:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
702:   }
703:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
704:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
705:   PetscCall(PetscLogGpuTimeEnd());
706:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
707:   PetscFunctionReturn(PETSC_SUCCESS);
708: }

710: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
711: {
712:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
713:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
714:   const PetscScalar                    *barray;
715:   PetscScalar                          *xarray;
716:   thrust::device_ptr<const PetscScalar> bGPU;
717:   thrust::device_ptr<PetscScalar>       xGPU;
718:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
719:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
720:   PetscInt                              m   = A->rmap->n;

722:   PetscFunctionBegin;
723:   PetscCall(PetscLogGpuTimeBegin());
724:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
725:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
726:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
727:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

729:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
730:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
731:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
732:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
733:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
734:   }

736:   if (!fs->updatedTransposeSpSVAnalysis) {
737:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

739:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
740:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
741:   }

743:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
744:   PetscCall(VecCUDAGetArrayRead(b, &barray));
745:   xGPU = thrust::device_pointer_cast(xarray);
746:   bGPU = thrust::device_pointer_cast(barray);

748:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
749:   if (fs->rpermIndices) {
750:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
751:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
752:   } else {
753:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
754:   }

756:   // Solve Ut Y = X
757:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
758:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

760:   // Solve Lt X = Y
761:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
762:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
763:   } else {
764:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
765:   }
766:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

768:   // Reorder X with the column permutation if needed, and put the result back to x
769:   if (fs->cpermIndices) {
770:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
771:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
772:   }

774:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
775:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
776:   PetscCall(PetscLogGpuTimeEnd());
777:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
778:   PetscFunctionReturn(PETSC_SUCCESS);
779: }

781: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
782: {
783:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
784:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
785:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
786:   CsrMatrix                    *Acsr;
787:   PetscInt                      m, nz;
788:   PetscBool                     flg;

790:   PetscFunctionBegin;
791:   if (PetscDefined(USE_DEBUG)) {
792:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
793:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
794:   }

796:   /* Copy A's value to fact */
797:   m  = fact->rmap->n;
798:   nz = aij->nz;
799:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
800:   Acsr = (CsrMatrix *)Acusp->mat->mat;
801:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

803:   PetscCall(PetscLogGpuTimeBegin());
804:   /* Factorize fact inplace */
805:   if (m)
806:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
807:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
808:   if (PetscDefined(USE_DEBUG)) {
809:     int              numerical_zero;
810:     cusparseStatus_t status;
811:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
812:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
813:   }

815: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
816:   if (fs->updatedSpSVAnalysis) {
817:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
818:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
819:   } else
820: #endif
821:   {
822:     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
823:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
824:     */
825:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

827:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

829:     fs->updatedSpSVAnalysis = PETSC_TRUE;
830:     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
831:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
832:   }

834:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
835:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
836:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
837:   fact->ops->matsolve          = NULL;
838:   fact->ops->matsolvetranspose = NULL;
839:   PetscCall(PetscLogGpuTimeEnd());
840:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
841:   PetscFunctionReturn(PETSC_SUCCESS);
842: }

844: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
845: {
846:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
847:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
848:   PetscInt                      m, nz;

850:   PetscFunctionBegin;
851:   if (PetscDefined(USE_DEBUG)) {
852:     PetscBool flg, diagDense;

854:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
855:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
856:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
857:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
858:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
859:   }

861:   /* Free the old stale stuff */
862:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

864:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
865:      but they will not be used. Allocate them just for easy debugging.
866:    */
867:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

869:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
870:   fact->factortype             = MAT_FACTOR_ILU;
871:   fact->info.factor_mallocs    = 0;
872:   fact->info.fill_ratio_given  = info->fill;
873:   fact->info.fill_ratio_needed = 1.0;

875:   aij->row = NULL;
876:   aij->col = NULL;

878:   /* ====================================================================== */
879:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
880:   /* We'll do in-place factorization on fact                                */
881:   /* ====================================================================== */
882:   const int *Ai, *Aj;

884:   m  = fact->rmap->n;
885:   nz = aij->nz;

887:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
888:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
889:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
890:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
891:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
892:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

894:   /* ====================================================================== */
895:   /* Create descriptors for M, L, U                                         */
896:   /* ====================================================================== */
897:   cusparseFillMode_t fillMode;
898:   cusparseDiagType_t diagType;

900:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
901:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
902:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

904:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
905:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
906:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
907:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
908:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
909:   */
910:   fillMode = CUSPARSE_FILL_MODE_LOWER;
911:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
912:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
913:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
914:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

916:   fillMode = CUSPARSE_FILL_MODE_UPPER;
917:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
918:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
919:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
920:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

922:   /* ========================================================================= */
923:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
924:   /* ========================================================================= */
925:   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
926:   if (m)
927:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
928:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

930:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
931:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

933:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
934:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

936:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
937:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

939:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
940:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

942:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
943:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
944:      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
945:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
946:    */
947:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
948:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
949:     fs->spsvBuffer_L = fs->factBuffer_M;
950:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
951:   } else {
952:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
953:     fs->spsvBuffer_U = fs->factBuffer_M;
954:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
955:   }

957:   /* ========================================================================== */
958:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
959:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
960:   /* ========================================================================== */
961:   int              structural_zero;
962:   cusparseStatus_t status;

964:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
965:   if (m)
966:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
967:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
968:   if (PetscDefined(USE_DEBUG)) {
969:     /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
970:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
971:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
972:   }

974:   /* Estimate FLOPs of the numeric factorization */
975:   {
976:     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
977:     PetscInt       *Ai, nzRow, nzLeft;
978:     const PetscInt *adiag;
979:     PetscLogDouble  flops = 0.0;

981:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
982:     Ai = Aseq->i;
983:     for (PetscInt i = 0; i < m; i++) {
984:       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
985:         nzRow  = Ai[i + 1] - Ai[i];
986:         nzLeft = adiag[i] - Ai[i];
987:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
988:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
989:         */
990:         nzLeft = (nzRow - 1) / 2;
991:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
992:       }
993:     }
994:     fs->numericFactFlops = flops;
995:   }
996:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
997:   PetscFunctionReturn(PETSC_SUCCESS);
998: }

1000: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1001: {
1002:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1003:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1004:   const PetscScalar            *barray;
1005:   PetscScalar                  *xarray;

1007:   PetscFunctionBegin;
1008:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1009:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1010:   PetscCall(PetscLogGpuTimeBegin());

1012:   /* Solve L*y = b */
1013:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1014:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1015:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1016:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1018:   /* Solve Lt*x = y */
1019:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1020:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1021:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1023:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1024:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

1026:   PetscCall(PetscLogGpuTimeEnd());
1027:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1028:   PetscFunctionReturn(PETSC_SUCCESS);
1029: }

1031: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1032: {
1033:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1034:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1035:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1036:   CsrMatrix                    *Acsr;
1037:   PetscInt                      m, nz;
1038:   PetscBool                     flg;

1040:   PetscFunctionBegin;
1041:   if (PetscDefined(USE_DEBUG)) {
1042:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1043:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1044:   }

1046:   /* Copy A's value to fact */
1047:   m  = fact->rmap->n;
1048:   nz = aij->nz;
1049:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1050:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1051:   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1053:   /* Factorize fact inplace */
1054:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1055:      csric02() only takes the lower triangular part of matrix A to perform factorization.
1056:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1057:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1058:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1059:    */
1060:   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1061:   if (PetscDefined(USE_DEBUG)) {
1062:     int              numerical_zero;
1063:     cusparseStatus_t status;
1064:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1065:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1066:   }

1068: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1069:   if (fs->updatedSpSVAnalysis) {
1070:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1071:     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1072:   } else
1073: #endif
1074:   {
1075:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1077:     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1078:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1079:   */
1080:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1081:     fs->updatedSpSVAnalysis = PETSC_TRUE;
1082:   }

1084:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1085:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1086:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1087:   fact->ops->matsolve          = NULL;
1088:   fact->ops->matsolvetranspose = NULL;
1089:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1090:   PetscFunctionReturn(PETSC_SUCCESS);
1091: }

1093: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1094: {
1095:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1096:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1097:   PetscInt                      m, nz;

1099:   PetscFunctionBegin;
1100:   if (PetscDefined(USE_DEBUG)) {
1101:     PetscBool flg, diagDense;

1103:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1104:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1105:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1106:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1107:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1108:   }

1110:   /* Free the old stale stuff */
1111:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

1113:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1114:      but they will not be used. Allocate them just for easy debugging.
1115:    */
1116:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1118:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1119:   fact->factortype             = MAT_FACTOR_ICC;
1120:   fact->info.factor_mallocs    = 0;
1121:   fact->info.fill_ratio_given  = info->fill;
1122:   fact->info.fill_ratio_needed = 1.0;

1124:   aij->row = NULL;
1125:   aij->col = NULL;

1127:   /* ====================================================================== */
1128:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1129:   /* We'll do in-place factorization on fact                                */
1130:   /* ====================================================================== */
1131:   const int *Ai, *Aj;

1133:   m  = fact->rmap->n;
1134:   nz = aij->nz;

1136:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1137:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1138:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1139:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1140:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1141:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

1143:   /* ====================================================================== */
1144:   /* Create mat descriptors for M, L                                        */
1145:   /* ====================================================================== */
1146:   cusparseFillMode_t fillMode;
1147:   cusparseDiagType_t diagType;

1149:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1150:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1151:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

1153:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1154:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1155:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1156:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1157:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1158:   */
1159:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1160:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1161:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1162:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1163:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1165:   /* ========================================================================= */
1166:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1167:   /* ========================================================================= */
1168:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1169:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

1171:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1172:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1174:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1175:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

1177:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1178:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1180:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1181:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1183:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1184:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1185:    */
1186:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1187:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1188:     fs->spsvBuffer_L = fs->factBuffer_M;
1189:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1190:   } else {
1191:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1192:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1193:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1194:   }

1196:   /* ========================================================================== */
1197:   /* Perform analysis of ic0 on M                                               */
1198:   /* The lower triangular part of M has the same sparsity pattern as L          */
1199:   /* ========================================================================== */
1200:   int              structural_zero;
1201:   cusparseStatus_t status;

1203:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1204:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1205:   if (PetscDefined(USE_DEBUG)) {
1206:     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1207:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1208:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1209:   }

1211:   /* Estimate FLOPs of the numeric factorization */
1212:   {
1213:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1214:     PetscInt      *Ai, nzRow, nzLeft;
1215:     PetscLogDouble flops = 0.0;

1217:     Ai = Aseq->i;
1218:     for (PetscInt i = 0; i < m; i++) {
1219:       nzRow = Ai[i + 1] - Ai[i];
1220:       if (nzRow > 1) {
1221:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1222:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1223:         */
1224:         nzLeft = (nzRow - 1) / 2;
1225:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1226:       }
1227:     }
1228:     fs->numericFactFlops = flops;
1229:   }
1230:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1231:   PetscFunctionReturn(PETSC_SUCCESS);
1232: }

1234: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1235: {
1236:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
1237:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

1239:   PetscFunctionBegin;
1240:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1241:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1242:   B->offloadmask = PETSC_OFFLOAD_CPU;

1244:   if (!cusparsestruct->use_cpu_solve) {
1245:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
1246:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1247:   }
1248:   B->ops->matsolve          = NULL;
1249:   B->ops->matsolvetranspose = NULL;

1251:   /* get the triangular factors */
1252:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
1253:   PetscFunctionReturn(PETSC_SUCCESS);
1254: }

1256: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1257: {
1258:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

1260:   PetscFunctionBegin;
1261:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1262:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1263:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1264:   PetscFunctionReturn(PETSC_SUCCESS);
1265: }

1267: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1268: {
1269:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1271:   PetscFunctionBegin;
1272:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1273:   if (!info->factoronhost) {
1274:     PetscCall(ISIdentity(isrow, &row_identity));
1275:     PetscCall(ISIdentity(iscol, &col_identity));
1276:   }
1277:   if (!info->levels && row_identity && col_identity) {
1278:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1279:   } else {
1280:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1281:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1282:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1283:   }
1284:   PetscFunctionReturn(PETSC_SUCCESS);
1285: }

1287: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1288: {
1289:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1291:   PetscFunctionBegin;
1292:   PetscBool perm_identity = PETSC_FALSE;
1293:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1294:   if (!info->levels && perm_identity) {
1295:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1296:   } else {
1297:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1298:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1299:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1300:   }
1301:   PetscFunctionReturn(PETSC_SUCCESS);
1302: }

1304: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1305: {
1306:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1308:   PetscFunctionBegin;
1309:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1310:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1311:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1312:   PetscFunctionReturn(PETSC_SUCCESS);
1313: }

1315: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1316: {
1317:   PetscFunctionBegin;
1318:   *type = MATSOLVERCUSPARSE;
1319:   PetscFunctionReturn(PETSC_SUCCESS);
1320: }

1322: /*MC
1323:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1324:   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1325:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1326:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1327:   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1328:   algorithms are not recommended. This class does NOT support direct solver operations.

1330:   Level: beginner

1332: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1333:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1334: M*/

1336: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1337: {
1338:   PetscInt n = A->rmap->n;

1340:   PetscFunctionBegin;
1341:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1342:   PetscCall(MatSetSizes(*B, n, n, n, n));
1343:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
1344:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

1346:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1347:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1348:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1349:     if (!A->boundtocpu) {
1350:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1351:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1352:     } else {
1353:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1354:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1355:     }
1356:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1357:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1358:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1359:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1360:     if (!A->boundtocpu) {
1361:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1362:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1363:     } else {
1364:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1365:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1366:     }
1367:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1368:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1369:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

1371:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1372:   (*B)->canuseordering = PETSC_TRUE;
1373:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
1374:   PetscFunctionReturn(PETSC_SUCCESS);
1375: }

1377: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1378: {
1379:   Mat_SeqAIJ                   *a    = (Mat_SeqAIJ *)A->data;
1380:   Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1381:   Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;

1383:   PetscFunctionBegin;
1384:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1385:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1386:     if (A->factortype == MAT_FACTOR_NONE) {
1387:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1388:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1389:     } else if (fs->csrVal) {
1390:       /* We have a factorized matrix on device and are able to copy it to host */
1391:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1392:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1393:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1394:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1395:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1396:   }
1397:   PetscFunctionReturn(PETSC_SUCCESS);
1398: }

1400: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (CUDA specialisation) */
1401: struct MatSeqAIJCUSPARSE_Policy {
1402:   typedef Mat_SeqAIJCUSPARSE           mat_struct_type;
1403:   typedef Mat_SeqAIJCUSPARSEMultStruct mult_struct_type;

1405:   static int storage_format_csr() { return (int)MAT_CUSPARSE_CSR; }
1406:   static int storage_format_ell() { return (int)MAT_CUSPARSE_ELL; }
1407:   static int storage_format_hyb() { return (int)MAT_CUSPARSE_HYB; }

1409:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
1410:   static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJCUSPARSECopyFromGPU(A); }
1411:   static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJCUSPARSEInvalidateTranspose(A, d); }
1412:   static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, t, r, C); }
1413:   static const char    *mat_type_name;

1415:   static PetscErrorCode Destroy(Mat A) { return MatSeqAIJCUSPARSE_Destroy(A); }
1416:   static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)spptr); }
1417:   static const char    *set_format_c;
1418:   static const char    *set_use_cpu_solve_c;
1419:   static const char    *product_seqdense_device_c;
1420:   static const char    *product_seqdense_c;
1421:   static const char    *product_self_c;
1422:   static const char    *seq_convert_hypre_c;

1424:   static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecCUDAGetArrayRead(v, a); }
1425:   static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecCUDARestoreArrayRead(v, a); }
1426:   static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecCUDAGetArrayWrite(v, a); }
1427:   static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecCUDARestoreArrayWrite(v, a); }
1428: };
1429: const char *MatSeqAIJCUSPARSE_Policy::mat_type_name             = MATSEQAIJCUSPARSE;
1430: const char *MatSeqAIJCUSPARSE_Policy::set_format_c              = "MatCUSPARSESetFormat_C";
1431: const char *MatSeqAIJCUSPARSE_Policy::set_use_cpu_solve_c       = "MatCUSPARSESetUseCPUSolve_C";
1432: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C";
1433: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_c        = "MatProductSetFromOptions_seqaijcusparse_seqdense_C";
1434: const char *MatSeqAIJCUSPARSE_Policy::product_self_c            = "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C";
1435: const char *MatSeqAIJCUSPARSE_Policy::seq_convert_hypre_c       = "MatConvert_seqaijcusparse_hypre_C";

1437: using MatSeqAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatSeqAIJCUSPARSE_Policy>;

1439: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1440: {
1441:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1442: }

1444: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1445: {
1446:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1447: }

1449: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1450: {
1451:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1452: }

1454: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1455: {
1456:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1457: }

1459: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1460: {
1461:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1462: }

1464: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1465: {
1466:   return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1467: }

1469: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1470: {
1471:   Mat_SeqAIJCUSPARSE *cusp;
1472:   CsrMatrix          *matrix;

1474:   PetscFunctionBegin;
1475:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1476:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1477:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1478:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1479:   matrix = (CsrMatrix *)cusp->mat->mat;

1481:   if (i) {
1482: #if !defined(PETSC_USE_64BIT_INDICES)
1483:     *i = matrix->row_offsets->data().get();
1484: #else
1485:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
1486: #endif
1487:   }
1488:   if (j) {
1489: #if !defined(PETSC_USE_64BIT_INDICES)
1490:     *j = matrix->column_indices->data().get();
1491: #else
1492:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
1493: #endif
1494:   }
1495:   if (a) *a = matrix->values->data().get();
1496:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1497:   PetscFunctionReturn(PETSC_SUCCESS);
1498: }

1500: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1501: {
1502:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1503:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
1504:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
1505:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
1506:   cusparseStatus_t              stat;
1507:   PetscBool                     both = PETSC_TRUE;

1509:   PetscFunctionBegin;
1510:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1511:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1512:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1513:       CsrMatrix *matrix;
1514:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

1516:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1517:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1518:       matrix->values->assign(a->a, a->a + a->nz);
1519:       PetscCallCUDA(WaitForCUDA());
1520:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1521:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1522:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
1523:     } else {
1524:       PetscInt nnz;
1525:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1526:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
1527:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1528:       delete cusparsestruct->workVector;
1529:       delete cusparsestruct->rowoffsets_gpu;
1530:       cusparsestruct->workVector     = NULL;
1531:       cusparsestruct->rowoffsets_gpu = NULL;
1532:       try {
1533:         if (a->compressedrow.use) {
1534:           m    = a->compressedrow.nrows;
1535:           ii   = a->compressedrow.i;
1536:           ridx = a->compressedrow.rindex;
1537:         } else {
1538:           m    = A->rmap->n;
1539:           ii   = a->i;
1540:           ridx = NULL;
1541:         }
1542:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1543:         if (!a->a) {
1544:           nnz  = ii[m];
1545:           both = PETSC_FALSE;
1546:         } else nnz = a->nz;
1547:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

1549:         /* create cusparse matrix */
1550:         cusparsestruct->nrows = m;
1551:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
1552:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
1553:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
1554:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1556:         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1557:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1558:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1559:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1560:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1561:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1562:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

1564:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1565:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1566:           /* set the matrix */
1567:           CsrMatrix *mat   = new CsrMatrix;
1568:           mat->num_rows    = m;
1569:           mat->num_cols    = A->cmap->n;
1570:           mat->num_entries = nnz;
1571:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
1572:           mat->row_offsets->assign(ii, ii + m + 1);
1573:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
1574:           mat->column_indices->assign(a->j, a->j + nnz);

1576:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1577:           if (a->a) mat->values->assign(a->a, a->a + nnz);

1579:           /* assign the pointer */
1580:           matstruct->mat = mat;
1581:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1582:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1583:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1584:             PetscCallCUSPARSE(stat);
1585:           }
1586:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1587:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1588:         }

1590:         /* assign the compressed row indices */
1591:         if (a->compressedrow.use) {
1592:           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
1593:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1594:           matstruct->cprowIndices->assign(ridx, ridx + m);
1595:           tmp = m;
1596:         } else {
1597:           cusparsestruct->workVector = NULL;
1598:           matstruct->cprowIndices    = NULL;
1599:           tmp                        = 0;
1600:         }
1601:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

1603:         /* assign the pointer */
1604:         cusparsestruct->mat = matstruct;
1605:       } catch (char *ex) {
1606:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
1607:       }
1608:       PetscCallCUDA(WaitForCUDA());
1609:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1610:       cusparsestruct->nonzerostate = A->nonzerostate;
1611:     }
1612:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1613:   }
1614:   PetscFunctionReturn(PETSC_SUCCESS);
1615: }

1617: struct VecCUDAPlusEquals {
1618:   template <typename Tuple>
1619:   __host__ __device__ void operator()(Tuple t)
1620:   {
1621:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1622:   }
1623: };

1625: struct VecCUDAEquals {
1626:   template <typename Tuple>
1627:   __host__ __device__ void operator()(Tuple t)
1628:   {
1629:     thrust::get<1>(t) = thrust::get<0>(t);
1630:   }
1631: };

1633: struct VecCUDAEqualsReverse {
1634:   template <typename Tuple>
1635:   __host__ __device__ void operator()(Tuple t)
1636:   {
1637:     thrust::get<0>(t) = thrust::get<1>(t);
1638:   }
1639: };

1641: struct MatProductCtx_MatMatCusparse {
1642:   PetscBool      cisdense;
1643:   PetscScalar   *Bt;
1644:   Mat            X;
1645:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1646:   PetscLogDouble flops;
1647:   CsrMatrix     *Bcsr;

1649:   cusparseSpMatDescr_t  matSpBDescr;
1650:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
1651:   cusparseDnMatDescr_t  matBDescr;
1652:   cusparseDnMatDescr_t  matCDescr;
1653:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1654:   void                 *dBuffer4;
1655:   void                 *dBuffer5;
1656:   size_t                mmBufferSize;
1657:   void                 *mmBuffer;
1658:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1659:   cusparseSpGEMMDescr_t spgemmDesc;
1660: };

1662: static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
1663: {
1664:   MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

1666:   PetscFunctionBegin;
1667:   PetscCallCUDA(cudaFree(mmdata->Bt));
1668:   delete mmdata->Bcsr;
1669:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
1670:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1671:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1672:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1673:   PetscCallCUDA(cudaFree(mmdata->dBuffer4));
1674:   PetscCallCUDA(cudaFree(mmdata->dBuffer5));
1675:   PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1676:   PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
1677:   PetscCall(MatDestroy(&mmdata->X));
1678:   PetscCall(PetscFree(mmdata));
1679:   PetscFunctionReturn(PETSC_SUCCESS);
1680: }

1682: #include <../src/mat/impls/dense/seq/dense.h>

1684: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1685: {
1686:   Mat_Product                  *product = C->product;
1687:   Mat                           A, B;
1688:   PetscInt                      m, n, blda, clda;
1689:   PetscBool                     flg, biscuda;
1690:   Mat_SeqAIJCUSPARSE           *cusp;
1691:   cusparseStatus_t              stat;
1692:   cusparseOperation_t           opA;
1693:   const PetscScalar            *barray;
1694:   PetscScalar                  *carray;
1695:   MatProductCtx_MatMatCusparse *mmdata;
1696:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1697:   CsrMatrix                    *csrmat;

1699:   PetscFunctionBegin;
1700:   MatCheckProduct(C, 1);
1701:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1702:   mmdata = (MatProductCtx_MatMatCusparse *)product->data;
1703:   A      = product->A;
1704:   B      = product->B;
1705:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1706:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1707:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1708:      Instead of silently accepting the wrong answer, I prefer to raise the error */
1709:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1710:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1711:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1712:   switch (product->type) {
1713:   case MATPRODUCT_AB:
1714:   case MATPRODUCT_PtAP:
1715:     mat = cusp->mat;
1716:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1717:     m   = A->rmap->n;
1718:     n   = B->cmap->n;
1719:     break;
1720:   case MATPRODUCT_AtB:
1721:     if (!A->form_explicit_transpose) {
1722:       mat = cusp->mat;
1723:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1724:     } else {
1725:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1726:       mat = cusp->matTranspose;
1727:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1728:     }
1729:     m = A->cmap->n;
1730:     n = B->cmap->n;
1731:     break;
1732:   case MATPRODUCT_ABt:
1733:   case MATPRODUCT_RARt:
1734:     mat = cusp->mat;
1735:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1736:     m   = A->rmap->n;
1737:     n   = B->rmap->n;
1738:     break;
1739:   default:
1740:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1741:   }
1742:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
1743:   csrmat = (CsrMatrix *)mat->mat;
1744:   /* if the user passed a CPU matrix, copy the data to the GPU */
1745:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
1746:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1747:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

1749:   PetscCall(MatDenseGetLDA(B, &blda));
1750:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1751:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1752:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1753:   } else {
1754:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1755:     PetscCall(MatDenseGetLDA(C, &clda));
1756:   }

1758:   PetscCall(PetscLogGpuTimeBegin());
1759:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1760: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
1761:   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
1762: #else
1763:   cusparseSpMatDescr_t &matADescr = mat->matDescr;
1764: #endif

1766:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
1767:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1768:     size_t mmBufferSize;
1769:     if (mmdata->initialized && mmdata->Blda != blda) {
1770:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1771:       mmdata->matBDescr = NULL;
1772:     }
1773:     if (!mmdata->matBDescr) {
1774:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1775:       mmdata->Blda = blda;
1776:     }

1778:     if (mmdata->initialized && mmdata->Clda != clda) {
1779:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1780:       mmdata->matCDescr = NULL;
1781:     }
1782:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1783:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1784:       mmdata->Clda = clda;
1785:     }

1787: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
1788:     if (matADescr) {
1789:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
1790:       matADescr = NULL;
1791:     }
1792: #endif

1794:     if (!matADescr) {
1795:       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1796:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1797:       PetscCallCUSPARSE(stat);
1798:     }

1800:     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

1802:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1803:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1804:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
1805:       mmdata->mmBufferSize = mmBufferSize;
1806:     }

1808: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
1809:     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1810: #endif

1812:     mmdata->initialized = PETSC_TRUE;
1813:   } else {
1814:     /* to be safe, always update pointers of the mats */
1815:     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
1816:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1817:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1818:   }

1820:   /* do cusparseSpMM, which supports transpose on B */
1821:   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

1823:   PetscCall(PetscLogGpuTimeEnd());
1824:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1825:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1826:   if (product->type == MATPRODUCT_RARt) {
1827:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1828:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1829:   } else if (product->type == MATPRODUCT_PtAP) {
1830:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1831:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1832:   } else {
1833:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1834:   }
1835:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1836:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1837:   PetscFunctionReturn(PETSC_SUCCESS);
1838: }

1840: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1841: {
1842:   Mat_Product                  *product = C->product;
1843:   Mat                           A, B;
1844:   PetscInt                      m, n;
1845:   PetscBool                     cisdense, flg;
1846:   MatProductCtx_MatMatCusparse *mmdata;
1847:   Mat_SeqAIJCUSPARSE           *cusp;

1849:   PetscFunctionBegin;
1850:   MatCheckProduct(C, 1);
1851:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1852:   A = product->A;
1853:   B = product->B;
1854:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1855:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1856:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1857:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1858:   switch (product->type) {
1859:   case MATPRODUCT_AB:
1860:     m = A->rmap->n;
1861:     n = B->cmap->n;
1862:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
1863:     break;
1864:   case MATPRODUCT_AtB:
1865:     m = A->cmap->n;
1866:     n = B->cmap->n;
1867:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
1868:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1869:     break;
1870:   case MATPRODUCT_ABt:
1871:     m = A->rmap->n;
1872:     n = B->rmap->n;
1873:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
1874:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1875:     break;
1876:   case MATPRODUCT_PtAP:
1877:     m = B->cmap->n;
1878:     n = B->cmap->n;
1879:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
1880:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1881:     break;
1882:   case MATPRODUCT_RARt:
1883:     m = B->rmap->n;
1884:     n = B->rmap->n;
1885:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
1886:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1887:     break;
1888:   default:
1889:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1890:   }
1891:   PetscCall(MatSetSizes(C, m, n, m, n));
1892:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1893:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1894:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

1896:   /* product data */
1897:   PetscCall(PetscNew(&mmdata));
1898:   mmdata->cisdense = cisdense;
1899:   /* for these products we need intermediate storage */
1900:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1901:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1902:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
1903:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1904:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1905:     } else {
1906:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1907:     }
1908:   }
1909:   C->product->data    = mmdata;
1910:   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

1912:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1913:   PetscFunctionReturn(PETSC_SUCCESS);
1914: }

1916: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1917: {
1918:   Mat_Product                  *product = C->product;
1919:   Mat                           A, B;
1920:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
1921:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
1922:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1923:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
1924:   PetscBool                     flg;
1925:   cusparseStatus_t              stat;
1926:   MatProductType                ptype;
1927:   MatProductCtx_MatMatCusparse *mmdata;
1928:   cusparseSpMatDescr_t          BmatSpDescr;
1929:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

1931:   PetscFunctionBegin;
1932:   MatCheckProduct(C, 1);
1933:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1934:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
1935:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1936:   mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
1937:   A      = product->A;
1938:   B      = product->B;
1939:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1940:     mmdata->reusesym = PETSC_FALSE;
1941:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
1942:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1943:     Cmat = Ccusp->mat;
1944:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1945:     Ccsr = (CsrMatrix *)Cmat->mat;
1946:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1947:     goto finalize;
1948:   }
1949:   if (!c->nz) goto finalize;
1950:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1951:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1952:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
1953:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1954:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1955:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1956:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1957:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
1958:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1959:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1960:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1961:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1962:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1963:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

1965:   ptype = product->type;
1966:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1967:     ptype = MATPRODUCT_AB;
1968:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1969:   }
1970:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1971:     ptype = MATPRODUCT_AB;
1972:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1973:   }
1974:   switch (ptype) {
1975:   case MATPRODUCT_AB:
1976:     Amat = Acusp->mat;
1977:     Bmat = Bcusp->mat;
1978:     break;
1979:   case MATPRODUCT_AtB:
1980:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1981:     Amat = Acusp->matTranspose;
1982:     Bmat = Bcusp->mat;
1983:     break;
1984:   case MATPRODUCT_ABt:
1985:     Amat = Acusp->mat;
1986:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
1987:     Bmat = Bcusp->matTranspose;
1988:     break;
1989:   default:
1990:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1991:   }
1992:   Cmat = Ccusp->mat;
1993:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1994:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1995:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1996:   Acsr = (CsrMatrix *)Amat->mat;
1997:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1998:   Ccsr = (CsrMatrix *)Cmat->mat;
1999:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2000:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2001:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2002:   PetscCall(PetscLogGpuTimeBegin());
2003:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2004:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2005:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2006:   PetscCallCUSPARSE(stat);
2007:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2008:   PetscCallCUDA(WaitForCUDA());
2009:   PetscCall(PetscLogGpuTimeEnd());
2010:   C->offloadmask = PETSC_OFFLOAD_GPU;
2011: finalize:
2012:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2013:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2014:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2015:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2016:   c->reallocs = 0;
2017:   C->info.mallocs += 0;
2018:   C->info.nz_unneeded = 0;
2019:   C->assembled = C->was_assembled = PETSC_TRUE;
2020:   C->num_ass++;
2021:   PetscFunctionReturn(PETSC_SUCCESS);
2022: }

2024: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2025: {
2026:   Mat_Product                  *product = C->product;
2027:   Mat                           A, B;
2028:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2029:   Mat_SeqAIJ                   *a, *b, *c;
2030:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2031:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2032:   PetscInt                      i, j, m, n, k;
2033:   PetscBool                     flg;
2034:   cusparseStatus_t              stat;
2035:   MatProductType                ptype;
2036:   MatProductCtx_MatMatCusparse *mmdata;
2037:   PetscLogDouble                flops;
2038:   PetscBool                     biscompressed, ciscompressed;
2039:   int64_t                       C_num_rows1, C_num_cols1, C_nnz1;
2040:   cusparseSpMatDescr_t          BmatSpDescr;
2041:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2042:   void                         *dBuffer1 = NULL, *dBuffer2 = NULL, *dBuffer3 = NULL;
2043:   size_t                        bufferSize1 = 0, bufferSize2 = 0, bufferSize3 = 0, bufferSize4 = 0, bufferSize5 = 0;

2045:   PetscFunctionBegin;
2046:   MatCheckProduct(C, 1);
2047:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2048:   A = product->A;
2049:   B = product->B;
2050:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2051:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2052:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2053:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2054:   a = (Mat_SeqAIJ *)A->data;
2055:   b = (Mat_SeqAIJ *)B->data;
2056:   /* product data */
2057:   PetscCall(PetscNew(&mmdata));
2058:   C->product->data    = mmdata;
2059:   C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

2061:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2062:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2063:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2064:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2065:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2066:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

2068:   ptype = product->type;
2069:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2070:     ptype                                          = MATPRODUCT_AB;
2071:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2072:   }
2073:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2074:     ptype                                          = MATPRODUCT_AB;
2075:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2076:   }
2077:   biscompressed = PETSC_FALSE;
2078:   ciscompressed = PETSC_FALSE;
2079:   switch (ptype) {
2080:   case MATPRODUCT_AB:
2081:     m    = A->rmap->n;
2082:     n    = B->cmap->n;
2083:     k    = A->cmap->n;
2084:     Amat = Acusp->mat;
2085:     Bmat = Bcusp->mat;
2086:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2087:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2088:     break;
2089:   case MATPRODUCT_AtB:
2090:     m = A->cmap->n;
2091:     n = B->cmap->n;
2092:     k = A->rmap->n;
2093:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2094:     Amat = Acusp->matTranspose;
2095:     Bmat = Bcusp->mat;
2096:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2097:     break;
2098:   case MATPRODUCT_ABt:
2099:     m = A->rmap->n;
2100:     n = B->rmap->n;
2101:     k = A->cmap->n;
2102:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2103:     Amat = Acusp->mat;
2104:     Bmat = Bcusp->matTranspose;
2105:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2106:     break;
2107:   default:
2108:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2109:   }

2111:   /* create cusparse matrix */
2112:   PetscCall(MatSetSizes(C, m, n, m, n));
2113:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2114:   c     = (Mat_SeqAIJ *)C->data;
2115:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2116:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2117:   Ccsr  = new CsrMatrix;

2119:   c->compressedrow.use = ciscompressed;
2120:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2121:     c->compressedrow.nrows = a->compressedrow.nrows;
2122:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2123:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2124:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2125:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2126:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2127:   } else {
2128:     c->compressedrow.nrows  = 0;
2129:     c->compressedrow.i      = NULL;
2130:     c->compressedrow.rindex = NULL;
2131:     Ccusp->workVector       = NULL;
2132:     Cmat->cprowIndices      = NULL;
2133:   }
2134:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2135:   Ccusp->mat        = Cmat;
2136:   Ccusp->mat->mat   = Ccsr;
2137:   Ccsr->num_rows    = Ccusp->nrows;
2138:   Ccsr->num_cols    = n;
2139:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2140:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2141:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2142:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2143:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2144:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2145:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2146:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2147:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2148:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2149:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2150:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2151:     c->nz                = 0;
2152:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2153:     Ccsr->values         = new THRUSTARRAY(c->nz);
2154:     goto finalizesym;
2155:   }

2157:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2158:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2159:   Acsr = (CsrMatrix *)Amat->mat;
2160:   if (!biscompressed) {
2161:     Bcsr        = (CsrMatrix *)Bmat->mat;
2162:     BmatSpDescr = Bmat->matDescr;
2163:   } else { /* we need to use row offsets for the full matrix */
2164:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2165:     Bcsr                 = new CsrMatrix;
2166:     Bcsr->num_rows       = B->rmap->n;
2167:     Bcsr->num_cols       = cBcsr->num_cols;
2168:     Bcsr->num_entries    = cBcsr->num_entries;
2169:     Bcsr->column_indices = cBcsr->column_indices;
2170:     Bcsr->values         = cBcsr->values;
2171:     if (!Bcusp->rowoffsets_gpu) {
2172:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2173:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2174:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2175:     }
2176:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2177:     mmdata->Bcsr      = Bcsr;
2178:     if (Bcsr->num_rows && Bcsr->num_cols) {
2179:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2180:       PetscCallCUSPARSE(stat);
2181:     }
2182:     BmatSpDescr = mmdata->matSpBDescr;
2183:   }
2184:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2185:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2186:   /* precompute flops count */
2187:   if (ptype == MATPRODUCT_AB) {
2188:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2189:       const PetscInt st = a->i[i];
2190:       const PetscInt en = a->i[i + 1];
2191:       for (j = st; j < en; j++) {
2192:         const PetscInt brow = a->j[j];
2193:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2194:       }
2195:     }
2196:   } else if (ptype == MATPRODUCT_AtB) {
2197:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2198:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2199:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2200:       flops += (2. * anzi) * bnzi;
2201:     }
2202:   } else { /* TODO */
2203:     flops = 0.;
2204:   }

2206:   mmdata->flops = flops;
2207:   PetscCall(PetscLogGpuTimeBegin());

2209:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2210:   // cuda-12.2 requires non-null csrRowOffsets
2211:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2212:   PetscCallCUSPARSE(stat);
2213:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2214:   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2215:     dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata
2216:     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2217:   */
2218:   /* ask bufferSize1 bytes for external memory */
2219:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2220:   PetscCallCUSPARSE(stat);
2221:   PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2222:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2223:   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2224:   PetscCallCUSPARSE(stat);

2226:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2227:   PetscCallCUSPARSE(stat);
2228:   PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2229:   PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2230:   PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2231:   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2232:   PetscCallCUSPARSE(stat);
2233:   PetscCallCUDA(cudaFree(dBuffer1));
2234:   PetscCallCUDA(cudaFree(dBuffer2));

2236:   /* get matrix C non-zero entries C_nnz1 */
2237:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2238:   c->nz = (PetscInt)C_nnz1;
2239:   /* allocate matrix C */
2240:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2241:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2242:   Ccsr->values = new THRUSTARRAY(c->nz);
2243:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2244:   /* update matC with the new pointers */
2245:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2246:   PetscCallCUSPARSE(stat);

2248:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2249:   PetscCallCUSPARSE(stat);
2250:   PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2251:   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2252:   PetscCallCUSPARSE(stat);
2253:   PetscCallCUDA(cudaFree(dBuffer3));
2254:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2255:   PetscCallCUSPARSE(stat);
2256:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2257:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2258:   PetscCall(PetscLogGpuTimeEnd());
2259: finalizesym:
2260:   c->free_a = PETSC_TRUE;
2261:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2262:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2263:   c->free_ij = PETSC_TRUE;
2264:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2265:     PetscInt      *d_i = c->i;
2266:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2267:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2268:     ii = *Ccsr->row_offsets;
2269:     jj = *Ccsr->column_indices;
2270:     if (ciscompressed) d_i = c->compressedrow.i;
2271:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2272:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2273:   } else {
2274:     PetscInt *d_i = c->i;
2275:     if (ciscompressed) d_i = c->compressedrow.i;
2276:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2277:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2278:   }
2279:   if (ciscompressed) { /* need to expand host row offsets */
2280:     PetscInt r = 0;
2281:     c->i[0]    = 0;
2282:     for (k = 0; k < c->compressedrow.nrows; k++) {
2283:       const PetscInt next = c->compressedrow.rindex[k];
2284:       const PetscInt old  = c->compressedrow.i[k];
2285:       for (; r < next; r++) c->i[r + 1] = old;
2286:     }
2287:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2288:   }
2289:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2290:   PetscCall(PetscMalloc1(m, &c->ilen));
2291:   PetscCall(PetscMalloc1(m, &c->imax));
2292:   c->maxnz         = c->nz;
2293:   c->nonzerorowcnt = 0;
2294:   c->rmax          = 0;
2295:   for (k = 0; k < m; k++) {
2296:     const PetscInt nn = c->i[k + 1] - c->i[k];
2297:     c->ilen[k] = c->imax[k] = nn;
2298:     c->nonzerorowcnt += (PetscInt)!!nn;
2299:     c->rmax = PetscMax(c->rmax, nn);
2300:   }
2301:   PetscCall(PetscMalloc1(c->nz, &c->a));
2302:   Ccsr->num_entries = c->nz;

2304:   C->nonzerostate++;
2305:   PetscCall(PetscLayoutSetUp(C->rmap));
2306:   PetscCall(PetscLayoutSetUp(C->cmap));
2307:   Ccusp->nonzerostate = C->nonzerostate;
2308:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2309:   C->preallocated     = PETSC_TRUE;
2310:   C->assembled        = PETSC_FALSE;
2311:   C->was_assembled    = PETSC_FALSE;
2312:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2313:     mmdata->reusesym = PETSC_TRUE;
2314:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2315:   }
2316:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2317:   PetscFunctionReturn(PETSC_SUCCESS);
2318: }

2320: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2322: /* handles sparse or dense B */
2323: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2324: {
2325:   Mat_Product *product = mat->product;
2326:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2328:   PetscFunctionBegin;
2329:   MatCheckProduct(mat, 1);
2330:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2331:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
2332:   if (product->type == MATPRODUCT_ABC) {
2333:     Ciscusp = PETSC_FALSE;
2334:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
2335:   }
2336:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2337:     PetscBool usecpu = PETSC_FALSE;
2338:     switch (product->type) {
2339:     case MATPRODUCT_AB:
2340:       if (product->api_user) {
2341:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2342:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2343:         PetscOptionsEnd();
2344:       } else {
2345:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2346:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2347:         PetscOptionsEnd();
2348:       }
2349:       break;
2350:     case MATPRODUCT_AtB:
2351:       if (product->api_user) {
2352:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2353:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2354:         PetscOptionsEnd();
2355:       } else {
2356:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2357:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2358:         PetscOptionsEnd();
2359:       }
2360:       break;
2361:     case MATPRODUCT_PtAP:
2362:       if (product->api_user) {
2363:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2364:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2365:         PetscOptionsEnd();
2366:       } else {
2367:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2368:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2369:         PetscOptionsEnd();
2370:       }
2371:       break;
2372:     case MATPRODUCT_RARt:
2373:       if (product->api_user) {
2374:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2375:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2376:         PetscOptionsEnd();
2377:       } else {
2378:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2379:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2380:         PetscOptionsEnd();
2381:       }
2382:       break;
2383:     case MATPRODUCT_ABC:
2384:       if (product->api_user) {
2385:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2386:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2387:         PetscOptionsEnd();
2388:       } else {
2389:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2390:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2391:         PetscOptionsEnd();
2392:       }
2393:       break;
2394:     default:
2395:       break;
2396:     }
2397:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2398:   }
2399:   /* dispatch */
2400:   if (isdense) {
2401:     switch (product->type) {
2402:     case MATPRODUCT_AB:
2403:     case MATPRODUCT_AtB:
2404:     case MATPRODUCT_ABt:
2405:     case MATPRODUCT_PtAP:
2406:     case MATPRODUCT_RARt:
2407:       if (product->A->boundtocpu) {
2408:         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2409:       } else {
2410:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2411:       }
2412:       break;
2413:     case MATPRODUCT_ABC:
2414:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2415:       break;
2416:     default:
2417:       break;
2418:     }
2419:   } else if (Biscusp && Ciscusp) {
2420:     switch (product->type) {
2421:     case MATPRODUCT_AB:
2422:     case MATPRODUCT_AtB:
2423:     case MATPRODUCT_ABt:
2424:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2425:       break;
2426:     case MATPRODUCT_PtAP:
2427:     case MATPRODUCT_RARt:
2428:     case MATPRODUCT_ABC:
2429:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2430:       break;
2431:     default:
2432:       break;
2433:     }
2434:   } else { /* fallback for AIJ */
2435:     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2436:   }
2437:   PetscFunctionReturn(PETSC_SUCCESS);
2438: }

2440: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2441: {
2442:   PetscFunctionBegin;
2443:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2444:   PetscFunctionReturn(PETSC_SUCCESS);
2445: }

2447: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2448: {
2449:   PetscFunctionBegin;
2450:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2451:   PetscFunctionReturn(PETSC_SUCCESS);
2452: }

2454: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2455: {
2456:   PetscFunctionBegin;
2457:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2458:   PetscFunctionReturn(PETSC_SUCCESS);
2459: }

2461: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2462: {
2463:   PetscFunctionBegin;
2464:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2465:   PetscFunctionReturn(PETSC_SUCCESS);
2466: }

2468: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2469: {
2470:   PetscFunctionBegin;
2471:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2472:   PetscFunctionReturn(PETSC_SUCCESS);
2473: }

2475: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2476: {
2477:   int i = blockIdx.x * blockDim.x + threadIdx.x;
2478:   if (i < n) y[idx[i]] += x[i];
2479: }

2481: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2482: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2483: {
2484:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2485:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2486:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2487:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
2488:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2489:   PetscBool                     compressed;
2490:   PetscInt                      nx, ny;

2492:   PetscFunctionBegin;
2493:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2494:   if (!a->nz) {
2495:     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
2496:     else PetscCall(VecSeq_CUDA::Set(zz, 0));
2497:     PetscFunctionReturn(PETSC_SUCCESS);
2498:   }
2499:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2500:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2501:   if (!trans) {
2502:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2503:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2504:   } else {
2505:     if (herm || !A->form_explicit_transpose) {
2506:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2507:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2508:     } else {
2509:       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2510:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
2511:     }
2512:   }
2513:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2514:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2516:   try {
2517:     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
2518:     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2519:     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

2521:     PetscCall(PetscLogGpuTimeBegin());
2522:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2523:       /* z = A x + beta y.
2524:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2525:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2526:       */
2527:       xptr = xarray;
2528:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2529:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2530:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2531:           allocated to accommodate different uses. So we get the length info directly from mat.
2532:        */
2533:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2534:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2535:         nx             = mat->num_cols; // since y = Ax
2536:         ny             = mat->num_rows;
2537:       }
2538:     } else {
2539:       /* z = A^T x + beta y
2540:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2541:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2542:        */
2543:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2544:       dptr = zarray;
2545:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2546:       if (compressed) { /* Scatter x to work vector */
2547:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

2549:         thrust::for_each(
2550: #if PetscDefined(HAVE_THRUST_ASYNC)
2551:           thrust::cuda::par.on(PetscDefaultCudaStream),
2552: #endif
2553:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2554:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
2555:       }
2556:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2557:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2558:         nx             = mat->num_rows; // since y = A^T x
2559:         ny             = mat->num_cols;
2560:       }
2561:     }

2563:     /* csr_spmv does y = alpha op(A) x + beta y */
2564:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2565: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2566:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
2567: #else
2568:       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
2569: #endif

2571:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2572: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2573:       if (!matDescr) {
2574:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2575:         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2576:       }
2577: #endif

2579:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2580:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
2581:         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
2582:         PetscCallCUSPARSE(
2583:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
2584:         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
2585: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
2586:         PetscCallCUSPARSE(
2587:           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2588: #endif
2589:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2590:       } else {
2591:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2592:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
2593:         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
2594:       }

2596:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));

2598:     } else {
2599:       if (cusparsestruct->nrows) {
2600:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2601:       }
2602:     }
2603:     PetscCall(PetscLogGpuTimeEnd());

2605:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2606:       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
2607:         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2608:           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
2609:         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2610:           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2611:         }
2612:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2613:         PetscCall(VecSeq_CUDA::Set(zz, 0));
2614:       }

2616:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2617:       if (compressed) {
2618:         PetscCall(PetscLogGpuTimeBegin());
2619:         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
2620:         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
2621:         PetscCall(PetscLogGpuTimeEnd());
2622:       }
2623:     } else {
2624:       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2625:     }
2626:     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
2627:     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
2628:     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
2629:   } catch (char *ex) {
2630:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2631:   }
2632:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2633:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2634:   PetscFunctionReturn(PETSC_SUCCESS);
2635: }

2637: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2638: {
2639:   PetscFunctionBegin;
2640:   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2641:   PetscFunctionReturn(PETSC_SUCCESS);
2642: }

2644: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
2645: {
2646:   PetscFunctionBegin;
2647:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetDiagonal(A, diag));
2648:   PetscFunctionReturn(PETSC_SUCCESS);
2649: }

2651: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec ll, Vec rr)
2652: {
2653:   PetscFunctionBegin;
2654:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2655:   PetscFunctionReturn(PETSC_SUCCESS);
2656: }

2658: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
2659: {
2660:   PetscFunctionBegin;
2661:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode));
2662:   PetscFunctionReturn(PETSC_SUCCESS);
2663: }

2665: /*@
2666:   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

2668:   Collective

2670:   Input Parameters:
2671: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2672: . m    - number of rows
2673: . n    - number of columns
2674: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
2675: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

2677:   Output Parameter:
2678: . A - the matrix

2680:   Level: intermediate

2682:   Notes:
2683:   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
2684:   calculations. For good matrix assembly performance the user should preallocate the matrix
2685:   storage by setting the parameter `nz` (or the array `nnz`).

2687:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2688:   MatXXXXSetPreallocation() paradgm instead of this routine directly.
2689:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

2691:   The AIJ format, also called
2692:   compressed row storage, is fully compatible with standard Fortran
2693:   storage.  That is, the stored row and column indices can begin at
2694:   either one (as in Fortran) or zero.

2696:   Specify the preallocated storage with either nz or nnz (not both).
2697:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2698:   allocation.

2700:   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

2702: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
2703:           `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
2704: @*/
2705: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2706: {
2707:   return MatSeqAIJCUSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2708: }

2710: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2711: {
2712:   return MatSeqAIJCUSPARSE_CUPM_t::Destroy(A);
2713: }

2715: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2716: {
2717:   PetscFunctionBegin;
2718:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2719:   PetscFunctionReturn(PETSC_SUCCESS);
2720: }

2722: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2723: {
2724:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2725:   Mat_SeqAIJCUSPARSE *cy;
2726:   Mat_SeqAIJCUSPARSE *cx;
2727:   PetscScalar        *ay;
2728:   const PetscScalar  *ax;
2729:   CsrMatrix          *csry, *csrx;

2731:   PetscFunctionBegin;
2732:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
2733:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
2734:   if (X->ops->axpy != Y->ops->axpy) {
2735:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2736:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2737:     PetscFunctionReturn(PETSC_SUCCESS);
2738:   }
2739:   /* if we are here, it means both matrices are bound to GPU */
2740:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
2741:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
2742:   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2743:   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2744:   csry = (CsrMatrix *)cy->mat->mat;
2745:   csrx = (CsrMatrix *)cx->mat->mat;
2746:   /* see if we can turn this into a cublas axpy */
2747:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2748:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2749:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2750:     if (eq) str = SAME_NONZERO_PATTERN;
2751:   }
2752:   /* spgeam is buggy with one column */
2753:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

2755:   if (str == SUBSET_NONZERO_PATTERN) {
2756:     PetscScalar b = 1.0;
2757:     size_t      bufferSize;
2758:     void       *buffer;

2760:     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
2761:     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
2762:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
2763:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2764:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2765:     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
2766:     PetscCall(PetscLogGpuTimeBegin());
2767:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2768:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2769:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2770:     PetscCall(PetscLogGpuTimeEnd());
2771:     PetscCallCUDA(cudaFree(buffer));

2773:     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
2774:     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
2775:     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
2776:   } else if (str == SAME_NONZERO_PATTERN) {
2777:     PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2778:   } else {
2779:     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2780:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2781:   }
2782:   PetscFunctionReturn(PETSC_SUCCESS);
2783: }

2785: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
2786: {
2787:   PetscFunctionBegin;
2788:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Scale(Y, a));
2789:   PetscFunctionReturn(PETSC_SUCCESS);
2790: }

2792: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2793: {
2794:   PetscFunctionBegin;
2795:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::ZeroEntries(A));
2796:   PetscFunctionReturn(PETSC_SUCCESS);
2797: }

2799: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(Mat A, PetscMemType *m)
2800: {
2801:   PetscFunctionBegin;
2802:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetCurrentMemType(A, m));
2803:   PetscFunctionReturn(PETSC_SUCCESS);
2804: }

2806: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
2807: {
2808:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

2810:   PetscFunctionBegin;
2811:   if (A->factortype != MAT_FACTOR_NONE) {
2812:     A->boundtocpu = flg;
2813:     PetscFunctionReturn(PETSC_SUCCESS);
2814:   }
2815:   if (flg) {
2816:     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

2818:     A->ops->scale                     = MatScale_SeqAIJ;
2819:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
2820:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
2821:     A->ops->axpy                      = MatAXPY_SeqAIJ;
2822:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
2823:     A->ops->mult                      = MatMult_SeqAIJ;
2824:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2825:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2826:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2827:     A->ops->multhermitiantranspose    = NULL;
2828:     A->ops->multhermitiantransposeadd = NULL;
2829:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
2830:     A->ops->getcurrentmemtype         = NULL;
2831:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2832:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2833:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
2834:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
2835:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2836:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2837:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
2838:   } else {
2839:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
2840:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
2841:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJCUSPARSE;
2842:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
2843:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
2844:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2845:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2846:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2847:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2848:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2849:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2850:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
2851:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
2852:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
2853:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
2854:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
2855:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
2856:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
2857:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
2858:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

2860:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
2861:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2862:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2863:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
2864:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
2865:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2866:   }
2867:   A->boundtocpu = flg;
2868:   if (flg && a->inode.size_csr) {
2869:     a->inode.use = PETSC_TRUE;
2870:   } else {
2871:     a->inode.use = PETSC_FALSE;
2872:   }
2873:   PetscFunctionReturn(PETSC_SUCCESS);
2874: }

2876: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
2877: {
2878:   Mat B;

2880:   PetscFunctionBegin;
2881:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
2882:   if (reuse == MAT_INITIAL_MATRIX) {
2883:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2884:   } else if (reuse == MAT_REUSE_MATRIX) {
2885:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2886:   }
2887:   B = *newmat;

2889:   PetscCall(PetscFree(B->defaultvectype));
2890:   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

2892:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2893:     if (B->factortype == MAT_FACTOR_NONE) {
2894:       Mat_SeqAIJCUSPARSE *spptr;
2895:       PetscCall(PetscNew(&spptr));
2896:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2897:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2898:       spptr->format     = MAT_CUSPARSE_CSR;
2899:       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
2900:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2901:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
2902:       B->spptr          = spptr;
2903:     } else {
2904:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2906:       PetscCall(PetscNew(&spptr));
2907:       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2908:       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2909:       B->spptr = spptr;
2910:     }
2911:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2912:   }
2913:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
2914:   B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
2915:   B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
2916:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
2917:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
2918:   B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
2919:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

2921:   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
2922:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
2923:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
2924: #if defined(PETSC_HAVE_HYPRE)
2925:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
2926: #endif
2927:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
2928:   PetscFunctionReturn(PETSC_SUCCESS);
2929: }

2931: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2932: {
2933:   PetscFunctionBegin;
2934:   PetscCall(MatCreate_SeqAIJ(B));
2935:   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
2936:   PetscFunctionReturn(PETSC_SUCCESS);
2937: }

2939: /*MC
2940:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

2942:    Options Database Keys:
2943: +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
2944: .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2945:                                            Other options include ell (ellpack) or hyb (hybrid).
2946: .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2947: -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU

2949:   Level: beginner

2951:   Notes:
2952:   These matrices can be in either CSR, ELL, or HYB format.

2954:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

2956:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
2957:   if some integer values passed in do not fit in `int`.

2959: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2960: M*/

2962: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2963: {
2964:   PetscFunctionBegin;
2965:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
2966:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
2967:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
2968:   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
2969:   PetscFunctionReturn(PETSC_SUCCESS);
2970: }

2972: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
2973: {
2974:   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

2976:   PetscFunctionBegin;
2977:   if (cusp) {
2978:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2979:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2980:     delete cusp->workVector;
2981:     delete cusp->rowoffsets_gpu;
2982:     delete cusp->csr2csc_i;
2983:     delete cusp->coords;
2984:     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
2985:     PetscCall(PetscFree(mat->spptr));
2986:   }
2987:   PetscFunctionReturn(PETSC_SUCCESS);
2988: }

2990: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2991: {
2992:   PetscFunctionBegin;
2993:   if (*mat) {
2994:     delete (*mat)->values;
2995:     delete (*mat)->column_indices;
2996:     delete (*mat)->row_offsets;
2997:     delete *mat;
2998:     *mat = 0;
2999:   }
3000:   PetscFunctionReturn(PETSC_SUCCESS);
3001: }

3003: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3004: {
3005:   CsrMatrix *mat;

3007:   PetscFunctionBegin;
3008:   if (*matstruct) {
3009:     if ((*matstruct)->mat) {
3010:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3011:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3012:       } else {
3013:         mat = (CsrMatrix *)(*matstruct)->mat;
3014:         PetscCall(CsrMatrix_Destroy(&mat));
3015:       }
3016:     }
3017:     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3018:     delete (*matstruct)->cprowIndices;
3019:     PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3020:     PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3021:     PetscCallCUDA(cudaFree((*matstruct)->beta_one));

3023:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3024:     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

3026:     for (int i = 0; i < 3; i++) {
3027:       if (mdata->cuSpMV[i].initialized) {
3028:         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3029:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3030:         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3031: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3032:         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
3033:         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
3034: #endif
3035:       }
3036:     }
3037:     delete *matstruct;
3038:     *matstruct = NULL;
3039:   }
3040:   PetscFunctionReturn(PETSC_SUCCESS);
3041: }

3043: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3044: {
3045:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

3047:   PetscFunctionBegin;
3048:   if (fs) {
3049:     delete fs->rpermIndices;
3050:     delete fs->cpermIndices;
3051:     fs->rpermIndices  = NULL;
3052:     fs->cpermIndices  = NULL;
3053:     fs->init_dev_prop = PETSC_FALSE;
3054:     PetscCallCUDA(cudaFree(fs->csrRowPtr));
3055:     PetscCallCUDA(cudaFree(fs->csrColIdx));
3056:     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
3057:     PetscCallCUDA(cudaFree(fs->csrColIdx32));
3058:     PetscCallCUDA(cudaFree(fs->csrVal));
3059:     PetscCallCUDA(cudaFree(fs->diag));
3060:     PetscCallCUDA(cudaFree(fs->X));
3061:     PetscCallCUDA(cudaFree(fs->Y));
3062:     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3063:     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3064:     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3065:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3066:     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3067:     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3068:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3069:     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3070:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3071:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3072:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3073:     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3074:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3075:     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3076:     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3077:     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3078:     PetscCall(PetscFree(fs->csrRowPtr_h));
3079:     PetscCall(PetscFree(fs->csrVal_h));
3080:     PetscCall(PetscFree(fs->diag_h));
3081:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3082:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3083:   }
3084:   PetscFunctionReturn(PETSC_SUCCESS);
3085: }

3087: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3088: {
3089:   PetscFunctionBegin;
3090:   if (*trifactors) {
3091:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3092:     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3093:     PetscCall(PetscFree(*trifactors));
3094:   }
3095:   PetscFunctionReturn(PETSC_SUCCESS);
3096: }

3098: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3099: {
3100:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

3102:   PetscFunctionBegin;
3103:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3104:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3105:   if (destroy) {
3106:     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3107:     delete cusp->csr2csc_i;
3108:     cusp->csr2csc_i = NULL;
3109:   }
3110:   A->transupdated = PETSC_FALSE;
3111:   PetscFunctionReturn(PETSC_SUCCESS);
3112: }

3114: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3115: {
3116:   PetscFunctionBegin;
3117:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3118:   PetscFunctionReturn(PETSC_SUCCESS);
3119: }

3121: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3122: {
3123:   PetscFunctionBegin;
3124:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3125:   PetscFunctionReturn(PETSC_SUCCESS);
3126: }

3128: /*@C
3129:   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

3131:   Not Collective

3133:   Input Parameters:
3134: + A          - the matrix
3135: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

3137:   Output Parameters:
3138: + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
3139: - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

3141:   Level: developer

3143:   Note:
3144:   When compressed is true, the CSR structure does not contain empty rows

3146: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
3147: @*/
3148: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3149: {
3150:   PetscFunctionBegin;
3151:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3152:   PetscFunctionReturn(PETSC_SUCCESS);
3153: }

3155: /*@C
3156:   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

3158:   Not Collective

3160:   Input Parameters:
3161: + A          - the matrix
3162: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3163: . i          - the CSR row pointers
3164: - j          - the CSR column indices

3166:   Level: developer

3168: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
3169: @*/
3170: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3171: {
3172:   PetscFunctionBegin;
3173:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3174:   PetscFunctionReturn(PETSC_SUCCESS);
3175: }

3177: /*@C
3178:   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

3180:   Not Collective

3182:   Input Parameter:
3183: . A - a `MATSEQAIJCUSPARSE` matrix

3185:   Output Parameter:
3186: . a - pointer to the device data

3188:   Level: developer

3190:   Note:
3191:   Will trigger host-to-device copies if the most up-to-date matrix data is on the host

3193: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
3194: @*/
3195: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3196: {
3197:   return MatSeqAIJCUSPARSE_CUPM_t::GetArrayRead(A, a);
3198: }

3200: /*@C
3201:   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

3203:   Not Collective

3205:   Input Parameters:
3206: + A - a `MATSEQAIJCUSPARSE` matrix
3207: - a - pointer to the device data

3209:   Level: developer

3211: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
3212: @*/
3213: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3214: {
3215:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayRead(A, a);
3216: }

3218: /*@C
3219:   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

3221:   Not Collective

3223:   Input Parameter:
3224: . A - a `MATSEQAIJCUSPARSE` matrix

3226:   Output Parameter:
3227: . a - pointer to the device data

3229:   Level: developer

3231:   Note:
3232:   Will trigger host-to-device copies if the most up-to-date matrix data is on the host

3234: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
3235: @*/
3236: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
3237: {
3238:   return MatSeqAIJCUSPARSE_CUPM_t::GetArray(A, a);
3239: }
3240: /*@C
3241:   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

3243:   Not Collective

3245:   Input Parameters:
3246: + A - a `MATSEQAIJCUSPARSE` matrix
3247: - a - pointer to the device data

3249:   Level: developer

3251: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
3252: @*/
3253: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
3254: {
3255:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArray(A, a);
3256: }

3258: /*@C
3259:   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

3261:   Not Collective

3263:   Input Parameter:
3264: . A - a `MATSEQAIJCUSPARSE` matrix

3266:   Output Parameter:
3267: . a - pointer to the device data

3269:   Level: developer

3271:   Note:
3272:   Does not trigger any host to device copies.

3274:   It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

3276: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
3277: @*/
3278: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
3279: {
3280:   return MatSeqAIJCUSPARSE_CUPM_t::GetArrayWrite(A, a);
3281: }

3283: /*@C
3284:   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

3286:   Not Collective

3288:   Input Parameters:
3289: + A - a `MATSEQAIJCUSPARSE` matrix
3290: - a - pointer to the device data

3292:   Level: developer

3294: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
3295: @*/
3296: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
3297: {
3298:   return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3299: }

3301: struct IJCompare4 {
3302:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3303:   {
3304:     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
3305:     if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
3306:     return false;
3307:   }
3308: };

3310: struct Shift {
3311:   int _shift;

3313:   Shift(int shift) : _shift(shift) { }
3314:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
3315: };

3317: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3318: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3319: {
3320:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3321:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
3322:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3323:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3324:   PetscInt                      Annz, Bnnz;
3325:   cusparseStatus_t              stat;
3326:   PetscInt                      i, m, n, zero = 0;

3328:   PetscFunctionBegin;
3331:   PetscAssertPointer(C, 4);
3332:   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3333:   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
3334:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3335:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3336:   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3337:   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3338:   if (reuse == MAT_INITIAL_MATRIX) {
3339:     m = A->rmap->n;
3340:     n = A->cmap->n + B->cmap->n;
3341:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
3342:     PetscCall(MatSetSizes(*C, m, n, m, n));
3343:     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
3344:     c                       = (Mat_SeqAIJ *)(*C)->data;
3345:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3346:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
3347:     Ccsr                    = new CsrMatrix;
3348:     Cmat->cprowIndices      = NULL;
3349:     c->compressedrow.use    = PETSC_FALSE;
3350:     c->compressedrow.nrows  = 0;
3351:     c->compressedrow.i      = NULL;
3352:     c->compressedrow.rindex = NULL;
3353:     Ccusp->workVector       = NULL;
3354:     Ccusp->nrows            = m;
3355:     Ccusp->mat              = Cmat;
3356:     Ccusp->mat->mat         = Ccsr;
3357:     Ccsr->num_rows          = m;
3358:     Ccsr->num_cols          = n;
3359:     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3360:     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3361:     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3362:     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3363:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3364:     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3365:     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3366:     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3367:     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3368:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3369:     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3370:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3371:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

3373:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
3374:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
3375:     Annz                 = (PetscInt)Acsr->column_indices->size();
3376:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
3377:     c->nz                = Annz + Bnnz;
3378:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
3379:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3380:     Ccsr->values         = new THRUSTARRAY(c->nz);
3381:     Ccsr->num_entries    = c->nz;
3382:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
3383:     if (c->nz) {
3384:       auto              Acoo = new THRUSTINTARRAY32(Annz);
3385:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
3386:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
3387:       THRUSTINTARRAY32 *Aroff, *Broff;

3389:       if (a->compressedrow.use) { /* need full row offset */
3390:         if (!Acusp->rowoffsets_gpu) {
3391:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3392:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3393:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3394:         }
3395:         Aroff = Acusp->rowoffsets_gpu;
3396:       } else Aroff = Acsr->row_offsets;
3397:       if (b->compressedrow.use) { /* need full row offset */
3398:         if (!Bcusp->rowoffsets_gpu) {
3399:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3400:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3401:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3402:         }
3403:         Broff = Bcusp->rowoffsets_gpu;
3404:       } else Broff = Bcsr->row_offsets;
3405:       PetscCall(PetscLogGpuTimeBegin());
3406:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3407:       PetscCallCUSPARSE(stat);
3408:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3409:       PetscCallCUSPARSE(stat);
3410:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3411:       auto Aperm = thrust::make_constant_iterator(1);
3412:       auto Bperm = thrust::make_constant_iterator(0);
3413:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3414:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3415:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
3416:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3417:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3418:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
3419:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3420:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3421:       auto p1    = Ccusp->coords->begin();
3422:       auto p2    = Ccusp->coords->begin();
3423: #if CCCL_VERSION >= 3001000
3424:       cuda::std::advance(p2, Annz);
3425: #else
3426:       thrust::advance(p2, Annz);
3427: #endif
3428:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
3429:       auto cci = thrust::make_counting_iterator(zero);
3430:       auto cce = thrust::make_counting_iterator(c->nz);
3431: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
3432:       auto pred = thrust::identity<int>();
3433: #else
3434:       auto pred = cuda::std::identity();
3435: #endif
3436:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3437:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3438:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3439:       PetscCallCUSPARSE(stat);
3440:       PetscCall(PetscLogGpuTimeEnd());
3441:       delete wPerm;
3442:       delete Acoo;
3443:       delete Bcoo;
3444:       delete Ccoo;
3445:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3446:       PetscCallCUSPARSE(stat);
3447:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3448:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3449:         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3450:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3451:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
3452:         CsrMatrix                    *CcsrT = new CsrMatrix;
3453:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3454:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

3456:         (*C)->form_explicit_transpose = PETSC_TRUE;
3457:         (*C)->transupdated            = PETSC_TRUE;
3458:         Ccusp->rowoffsets_gpu         = NULL;
3459:         CmatT->cprowIndices           = NULL;
3460:         CmatT->mat                    = CcsrT;
3461:         CcsrT->num_rows               = n;
3462:         CcsrT->num_cols               = m;
3463:         CcsrT->num_entries            = c->nz;

3465:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
3466:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
3467:         CcsrT->values         = new THRUSTARRAY(c->nz);

3469:         PetscCall(PetscLogGpuTimeBegin());
3470:         auto rT = CcsrT->row_offsets->begin();
3471:         if (AT) {
3472:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3473: #if CCCL_VERSION >= 3001000
3474:           cuda::std::advance(rT, -1);
3475: #else
3476:           thrust::advance(rT, -1);
3477: #endif
3478:         }
3479:         if (BT) {
3480:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3481:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3482:           thrust::copy(titb, tite, rT);
3483:         }
3484:         auto cT = CcsrT->column_indices->begin();
3485:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3486:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3487:         auto vT = CcsrT->values->begin();
3488:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3489:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3490:         PetscCall(PetscLogGpuTimeEnd());

3492:         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
3493:         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
3494:         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3495:         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3496:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3497:         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3498:         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3499:         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3500:         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3501:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3502:         PetscCallCUSPARSE(stat);
3503:         Ccusp->matTranspose = CmatT;
3504:       }
3505:     }

3507:     c->free_a = PETSC_TRUE;
3508:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3509:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3510:     c->free_ij = PETSC_TRUE;
3511:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3512:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3513:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
3514:       ii = *Ccsr->row_offsets;
3515:       jj = *Ccsr->column_indices;
3516:       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3517:       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3518:     } else {
3519:       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3520:       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3521:     }
3522:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3523:     PetscCall(PetscMalloc1(m, &c->ilen));
3524:     PetscCall(PetscMalloc1(m, &c->imax));
3525:     c->maxnz         = c->nz;
3526:     c->nonzerorowcnt = 0;
3527:     c->rmax          = 0;
3528:     for (i = 0; i < m; i++) {
3529:       const PetscInt nn = c->i[i + 1] - c->i[i];
3530:       c->ilen[i] = c->imax[i] = nn;
3531:       c->nonzerorowcnt += (PetscInt)!!nn;
3532:       c->rmax = PetscMax(c->rmax, nn);
3533:     }
3534:     PetscCall(PetscMalloc1(c->nz, &c->a));
3535:     (*C)->nonzerostate++;
3536:     PetscCall(PetscLayoutSetUp((*C)->rmap));
3537:     PetscCall(PetscLayoutSetUp((*C)->cmap));
3538:     Ccusp->nonzerostate = (*C)->nonzerostate;
3539:     (*C)->preallocated  = PETSC_TRUE;
3540:   } else {
3541:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3542:     c = (Mat_SeqAIJ *)(*C)->data;
3543:     if (c->nz) {
3544:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3545:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3546:       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3547:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3548:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3549:       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3550:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3551:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3552:       Acsr = (CsrMatrix *)Acusp->mat->mat;
3553:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3554:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3555:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3556:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3557:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3558:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3559:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3560:       auto pmid = Ccusp->coords->begin();
3561: #if CCCL_VERSION >= 3001000
3562:       cuda::std::advance(pmid, Acsr->num_entries);
3563: #else
3564:       thrust::advance(pmid, Acsr->num_entries);
3565: #endif
3566:       PetscCall(PetscLogGpuTimeBegin());
3567:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3568:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3569:       thrust::for_each(zibait, zieait, VecCUDAEquals());
3570:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3571:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3572:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
3573:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3574:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3575:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
3576:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3577:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3578:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3579:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3580:         auto       vT    = CcsrT->values->begin();
3581:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3582:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3583:         (*C)->transupdated = PETSC_TRUE;
3584:       }
3585:       PetscCall(PetscLogGpuTimeEnd());
3586:     }
3587:   }
3588:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3589:   (*C)->assembled     = PETSC_TRUE;
3590:   (*C)->was_assembled = PETSC_FALSE;
3591:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
3592:   PetscFunctionReturn(PETSC_SUCCESS);
3593: }

3595: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3596: {
3597:   PetscFunctionBegin;
3598:   PetscCall(MatSeqAIJCUSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3599:   PetscFunctionReturn(PETSC_SUCCESS);
3600: }