Actual source code: aijcusparse.cu
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <petsc/private/vecimpl.h>
 12: #undef VecType
 13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 14: #include <thrust/adjacent_difference.h>
 15: #if PETSC_CPP_VERSION >= 14
 16:   #define PETSC_HAVE_THRUST_ASYNC 1
 17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
 18: #endif
 19: #include <thrust/iterator/constant_iterator.h>
 20: #include <thrust/remove.h>
 21: #include <thrust/sort.h>
 22: #include <thrust/unique.h>
 23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
 24:   #include <cuda/std/functional>
 25: #endif

 27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 29: /*
 30:   The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
 31:   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
 32: */
 33: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 34: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 35: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 36: #endif

 38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
 41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 42: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
 43: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 44: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 45: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 46: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 47: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 48: #endif
 49: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 50: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 51: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 52: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 53: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 54: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 55: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 56: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 57: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 58: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 60: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 61: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 62: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 63: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

 65: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 66: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 68: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 69: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 70: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 72: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 73: {
 74:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

 76:   PetscFunctionBegin;
 77:   switch (op) {
 78:   case MAT_CUSPARSE_MULT:
 79:     cusparsestruct->format = format;
 80:     break;
 81:   case MAT_CUSPARSE_ALL:
 82:     cusparsestruct->format = format;
 83:     break;
 84:   default:
 85:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
 86:   }
 87:   PetscFunctionReturn(PETSC_SUCCESS);
 88: }

 90: /*@
 91:   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
 92:   operation. Only the `MatMult()` operation can use different GPU storage formats

 94:   Not Collective

 96:   Input Parameters:
 97: + A      - Matrix of type `MATSEQAIJCUSPARSE`
 98: . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
 99:            `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
100: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

102:   Level: intermediate

104: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105: @*/
106: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
107: {
108:   PetscFunctionBegin;
110:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
111:   PetscFunctionReturn(PETSC_SUCCESS);
112: }

114: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115: {
116:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

118:   PetscFunctionBegin;
119:   cusparsestruct->use_cpu_solve = use_cpu;
120:   PetscFunctionReturn(PETSC_SUCCESS);
121: }

123: /*@
124:   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

126:   Input Parameters:
127: + A       - Matrix of type `MATSEQAIJCUSPARSE`
128: - use_cpu - set flag for using the built-in CPU `MatSolve()`

130:   Level: intermediate

132:   Note:
133:   The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135:   This method to specify if the solve is done on the CPU or GPU (GPU is the default).

137: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138: @*/
139: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
140: {
141:   PetscFunctionBegin;
143:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
144:   PetscFunctionReturn(PETSC_SUCCESS);
145: }

147: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148: {
149:   PetscFunctionBegin;
150:   switch (op) {
151:   case MAT_FORM_EXPLICIT_TRANSPOSE:
152:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
153:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154:     A->form_explicit_transpose = flg;
155:     break;
156:   default:
157:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158:     break;
159:   }
160:   PetscFunctionReturn(PETSC_SUCCESS);
161: }

163: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164: {
165:   MatCUSPARSEStorageFormat format;
166:   PetscBool                flg;
167:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

169:   PetscFunctionBegin;
170:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171:   if (A->factortype == MAT_FACTOR_NONE) {
172:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

175:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177:     PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178:     if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181:     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183:     PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184:   #else
185:     PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186:   #endif
187:     PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188:     PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

190:     PetscCall(
191:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192:     PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193: #endif
194:   }
195:   PetscOptionsHeadEnd();
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }

199: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
200: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201: {
202:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
203:   PetscInt                      m  = A->rmap->n;
204:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
206:   const MatScalar              *Aa = a->a;
207:   PetscInt                     *Mi, *Mj, Mnz;
208:   PetscScalar                  *Ma;

210:   PetscFunctionBegin;
211:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
212:     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
213:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
214:       Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
215:       PetscCall(PetscMalloc1(m + 1, &Mi));
216:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
217:       PetscCall(PetscMalloc1(Mnz, &Ma));
218:       Mi[0] = 0;
219:       for (PetscInt i = 0; i < m; i++) {
220:         PetscInt llen = Ai[i + 1] - Ai[i];
221:         PetscInt ulen = Adiag[i] - Adiag[i + 1];
222:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
223:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
224:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
225:         Mi[i + 1] = Mi[i] + llen + ulen;
226:       }
227:       // Copy M (L,U) from host to device
228:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
229:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
230:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
231:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
232:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

234:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
235:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
236:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
237:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
238:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
239:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
240:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
241:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

243:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
244:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
245:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

247:       fillMode = CUSPARSE_FILL_MODE_UPPER;
248:       diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
249:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
250:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
251:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

253:       // Allocate work vectors in SpSv
254:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
255:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

257:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
258:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

260:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
261:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
262:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
263:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
264:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
265:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
266:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

268:       // Record for reuse
269:       fs->csrRowPtr_h = Mi;
270:       fs->csrVal_h    = Ma;
271:       PetscCall(PetscFree(Mj));
272:     }
273:     // Copy the value
274:     Mi  = fs->csrRowPtr_h;
275:     Ma  = fs->csrVal_h;
276:     Mnz = Mi[m];
277:     for (PetscInt i = 0; i < m; i++) {
278:       PetscInt llen = Ai[i + 1] - Ai[i];
279:       PetscInt ulen = Adiag[i] - Adiag[i + 1];
280:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
281:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
282:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
283:     }
284:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

286:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
287:     if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
288:       // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
289:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
290:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291:     } else
292:   #endif
293:     {
294:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
295:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

297:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
298:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
299:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
300:     }
301:   }
302:   PetscFunctionReturn(PETSC_SUCCESS);
303: }
304: #else
305: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
306: {
307:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
308:   PetscInt                           n                  = A->rmap->n;
309:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
310:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
311:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
312:   const MatScalar                   *aa = a->a, *v;
313:   PetscInt                          *AiLo, *AjLo;
314:   PetscInt                           i, nz, nzLower, offset, rowOffset;

316:   PetscFunctionBegin;
317:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
318:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
319:     try {
320:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
321:       nzLower = n + ai[n] - ai[1];
322:       if (!loTriFactor) {
323:         PetscScalar *AALo;

325:         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

327:         /* Allocate Space for the lower triangular matrix */
328:         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
329:         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

331:         /* Fill the lower triangular matrix */
332:         AiLo[0]   = (PetscInt)0;
333:         AiLo[n]   = nzLower;
334:         AjLo[0]   = (PetscInt)0;
335:         AALo[0]   = (MatScalar)1.0;
336:         v         = aa;
337:         vi        = aj;
338:         offset    = 1;
339:         rowOffset = 1;
340:         for (i = 1; i < n; i++) {
341:           nz = ai[i + 1] - ai[i];
342:           /* additional 1 for the term on the diagonal */
343:           AiLo[i] = rowOffset;
344:           rowOffset += nz + 1;

346:           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
347:           PetscCall(PetscArraycpy(&AALo[offset], v, nz));

349:           offset += nz;
350:           AjLo[offset] = (PetscInt)i;
351:           AALo[offset] = (MatScalar)1.0;
352:           offset += 1;

354:           v += nz;
355:           vi += nz;
356:         }

358:         /* allocate space for the triangular factor information */
359:         PetscCall(PetscNew(&loTriFactor));
360:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
361:         /* Create the matrix description */
362:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
363:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
364:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
365:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
366:   #else
367:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
368:   #endif
369:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
370:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

372:         /* set the operation */
373:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

375:         /* set the matrix */
376:         loTriFactor->csrMat              = new CsrMatrix;
377:         loTriFactor->csrMat->num_rows    = n;
378:         loTriFactor->csrMat->num_cols    = n;
379:         loTriFactor->csrMat->num_entries = nzLower;

381:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
382:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

384:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
385:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

387:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
388:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

390:         /* Create the solve analysis information */
391:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
392:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
393:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
394:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
395:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
396:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
397:   #endif

399:         /* perform the solve analysis */
400:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
401:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
402:         PetscCallCUDA(WaitForCUDA());
403:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

405:         /* assign the pointer */
406:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
407:         loTriFactor->AA_h                                          = AALo;
408:         PetscCallCUDA(cudaFreeHost(AiLo));
409:         PetscCallCUDA(cudaFreeHost(AjLo));
410:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
411:       } else { /* update values only */
412:         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
413:         /* Fill the lower triangular matrix */
414:         loTriFactor->AA_h[0] = 1.0;
415:         v                    = aa;
416:         vi                   = aj;
417:         offset               = 1;
418:         for (i = 1; i < n; i++) {
419:           nz = ai[i + 1] - ai[i];
420:           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
421:           offset += nz;
422:           loTriFactor->AA_h[offset] = 1.0;
423:           offset += 1;
424:           v += nz;
425:         }
426:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
427:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
428:       }
429:     } catch (char *ex) {
430:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
431:     }
432:   }
433:   PetscFunctionReturn(PETSC_SUCCESS);
434: }

436: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
437: {
438:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
439:   PetscInt                           n                  = A->rmap->n;
440:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
441:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
442:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
443:   const MatScalar                   *aa = a->a, *v;
444:   PetscInt                          *AiUp, *AjUp;
445:   PetscInt                           i, nz, nzUpper, offset;

447:   PetscFunctionBegin;
448:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
449:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
450:     try {
451:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
452:       nzUpper = adiag[0] - adiag[n];
453:       if (!upTriFactor) {
454:         PetscScalar *AAUp;

456:         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

458:         /* Allocate Space for the upper triangular matrix */
459:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
460:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

462:         /* Fill the upper triangular matrix */
463:         AiUp[0] = (PetscInt)0;
464:         AiUp[n] = nzUpper;
465:         offset  = nzUpper;
466:         for (i = n - 1; i >= 0; i--) {
467:           v  = aa + adiag[i + 1] + 1;
468:           vi = aj + adiag[i + 1] + 1;

470:           /* number of elements NOT on the diagonal */
471:           nz = adiag[i] - adiag[i + 1] - 1;

473:           /* decrement the offset */
474:           offset -= (nz + 1);

476:           /* first, set the diagonal elements */
477:           AjUp[offset] = (PetscInt)i;
478:           AAUp[offset] = (MatScalar)1. / v[nz];
479:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

481:           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
482:           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
483:         }

485:         /* allocate space for the triangular factor information */
486:         PetscCall(PetscNew(&upTriFactor));
487:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

489:         /* Create the matrix description */
490:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
491:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
492:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
493:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
494:   #else
495:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
496:   #endif
497:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
498:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

500:         /* set the operation */
501:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

503:         /* set the matrix */
504:         upTriFactor->csrMat              = new CsrMatrix;
505:         upTriFactor->csrMat->num_rows    = n;
506:         upTriFactor->csrMat->num_cols    = n;
507:         upTriFactor->csrMat->num_entries = nzUpper;

509:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
510:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

512:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
513:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

515:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
516:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

518:         /* Create the solve analysis information */
519:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
520:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
521:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
522:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
523:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
524:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
525:   #endif

527:         /* perform the solve analysis */
528:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
529:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

531:         PetscCallCUDA(WaitForCUDA());
532:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

534:         /* assign the pointer */
535:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
536:         upTriFactor->AA_h                                          = AAUp;
537:         PetscCallCUDA(cudaFreeHost(AiUp));
538:         PetscCallCUDA(cudaFreeHost(AjUp));
539:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
540:       } else {
541:         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
542:         /* Fill the upper triangular matrix */
543:         offset = nzUpper;
544:         for (i = n - 1; i >= 0; i--) {
545:           v = aa + adiag[i + 1] + 1;

547:           /* number of elements NOT on the diagonal */
548:           nz = adiag[i] - adiag[i + 1] - 1;

550:           /* decrement the offset */
551:           offset -= (nz + 1);

553:           /* first, set the diagonal elements */
554:           upTriFactor->AA_h[offset] = 1. / v[nz];
555:           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
556:         }
557:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
558:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
559:       }
560:     } catch (char *ex) {
561:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
562:     }
563:   }
564:   PetscFunctionReturn(PETSC_SUCCESS);
565: }
566: #endif

568: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
569: {
570:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
571:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
572:   IS                            isrow = a->row, isicol = a->icol;
573:   PetscBool                     row_identity, col_identity;
574:   PetscInt                      n = A->rmap->n;

576:   PetscFunctionBegin;
577:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
578: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
579:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
580: #else
581:   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
582:   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
583:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
584: #endif

586:   cusparseTriFactors->nnz = a->nz;

588:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
589:   /* lower triangular indices */
590:   PetscCall(ISIdentity(isrow, &row_identity));
591:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
592:     const PetscInt *r;

594:     PetscCall(ISGetIndices(isrow, &r));
595:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
596:     cusparseTriFactors->rpermIndices->assign(r, r + n);
597:     PetscCall(ISRestoreIndices(isrow, &r));
598:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
599:   }

601:   /* upper triangular indices */
602:   PetscCall(ISIdentity(isicol, &col_identity));
603:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
604:     const PetscInt *c;

606:     PetscCall(ISGetIndices(isicol, &c));
607:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
608:     cusparseTriFactors->cpermIndices->assign(c, c + n);
609:     PetscCall(ISRestoreIndices(isicol, &c));
610:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
611:   }
612:   PetscFunctionReturn(PETSC_SUCCESS);
613: }

615: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - Copies the host Cholesky/ICC factor of A to the device as a
  regular upper-triangular CSR matrix (unit diagonal) plus a separate diagonal array, and prepares cusparseSpSV
  for both the non-transpose (U) and transpose (Ut) solves.

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors (fs)

  Notes:
  Nothing is done unless A->offloadmask == PETSC_OFFLOAD_CPU.

  First call (fs->csrRowPtr is NULL): allocates the host staging arrays, builds the column indices, allocates the
  device arrays, creates the cusparse sparse-matrix / dense-vector / SpSV descriptors and the SpSV work buffers.
  The host value and diagonal arrays are kept in fs->csrVal_h and fs->diag_h for reuse.

  Every call: refills the host values and diagonal from a->a, copies them to the device, then either calls
  cusparseSpSV_updateMatrix() (CUDA >= 12.1.1 and an analysis was already done) or cusparseSpSV_analysis().
*/
616: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
617: {
618:   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
619:   PetscInt                      m  = A->rmap->n;
620:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
621:   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
622:   const MatScalar              *Aa = a->a;
623:   PetscInt                     *Mj, Mnz;
624:   PetscScalar                  *Ma, *D;

626:   PetscFunctionBegin;
627:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
628:     if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
629:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
630:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
631:       Mnz = Ai[m]; // Unz (with the unit diagonal)
632:       PetscCall(PetscMalloc1(Mnz, &Ma));
633:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
634:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
          // Row i of M starts with the diagonal column index i, followed by the first ulen-1 column indices of row i of Aj.
          // NOTE(review): assumes every row has ulen >= 1, otherwise the copy length below would be negative -- confirm.
635:       for (PetscInt i = 0; i < m; i++) {
636:         PetscInt ulen = Ai[i + 1] - Ai[i];
637:         Mj[Ai[i]]     = i;                                              // diagonal entry
638:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
639:       }
640:       // Copy M (U) from host to device
          // Only the structure (row pointers, column indices) is copied here; csrVal and diag are filled on every call further below
641:       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
642:       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
643:       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
644:       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
645:       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
646:       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

648:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
649:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
650:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
651:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
652:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
653:       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
654:       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
655:       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

657:       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
658:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
659:       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

661:       // Allocate work vectors in SpSv
662:       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
663:       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

665:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
666:       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

668:       // Query buffer sizes for SpSV and then allocate buffers
669:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
670:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
671:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

673:       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
674:       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
675:       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

677:       // Record for reuse
          // Ma and D stay allocated and are owned by fs from here on; only the temporary Mj is released
678:       fs->csrVal_h = Ma;
679:       fs->diag_h   = D;
680:       PetscCall(PetscFree(Mj));
681:     }
682:     // Copy the value
        // Runs on every call: rebuild the host values from the current a->a, then push values and diagonal to the device
683:     Ma  = fs->csrVal_h;
684:     D   = fs->diag_h;
685:     Mnz = Ai[m];
686:     for (PetscInt i = 0; i < m; i++) {
687:       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
688:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
          // off-diagonal entries are stored with their sign flipped relative to the host factor
689:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
690:     }
691:     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
692:     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

        // With CUDA >= 12.1.1 an already-analyzed matrix only needs its values refreshed; otherwise the "else" is compiled out
        // and the braces below always run the full analysis
694:   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
695:     if (fs->updatedSpSVAnalysis) {
696:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
697:       if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
698:     } else
699:   #endif
700:     {
701:       // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
702:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
703:       PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
704:       fs->updatedSpSVAnalysis = PETSC_TRUE;
705:     }
706:   }
707:   PetscFunctionReturn(PETSC_SUCCESS);
708: }

710: // Solve Ut D U x = b
711: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
712: {
713:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
714:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
715:   const PetscScalar                    *barray;
716:   PetscScalar                          *xarray;
717:   thrust::device_ptr<const PetscScalar> bGPU;
718:   thrust::device_ptr<PetscScalar>       xGPU;
719:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
720:   PetscInt                              m   = A->rmap->n;

722:   PetscFunctionBegin;
723:   PetscCall(PetscLogGpuTimeBegin());
724:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
725:   PetscCall(VecCUDAGetArrayRead(b, &barray));
726:   xGPU = thrust::device_pointer_cast(xarray);
727:   bGPU = thrust::device_pointer_cast(barray);

729:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
730:   if (fs->rpermIndices) {
731:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
732:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
733:   } else {
734:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
735:   }

737:   // Solve Ut Y = X
738:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
739:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

741:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
742:   // It is basically a vector element-wise multiplication, but cublas does not have it!
743:   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

745:   // Solve U X = Y
746:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
747:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
748:   } else {
749:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
750:   }
751:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

753:   // Reorder X with the column permutation if needed, and put the result back to x
754:   if (fs->cpermIndices) {
755:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
756:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
757:   }

759:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
760:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
761:   PetscCall(PetscLogGpuTimeEnd());
762:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
763:   PetscFunctionReturn(PETSC_SUCCESS);
764: }
765: #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds (first call) or refreshes (later calls) the two triangular-solve
  structures upTriFactor and loTriFactor on the GPU from the host ICC/Cholesky factor stored in A->data.
  This function is only compiled in the #else branch of PETSC_PKG_CUDA_VERSION_GE(11, 4, 0).

  Input Parameter:
. A - the factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when the local row count is 0, and does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Value layout written for each row i (nz = row length - 1, v[nz] = last stored entry of the row):
    slot 0      : AAUp = AALo = 1/v[nz], column index i
    slots 1..nz : AAUp = -v[k], AALo = AAUp / v[nz], column indices copied from the host row

  Both factors are created with the same row offsets and column indices (AiUp/AjUp) and CUSPARSE_FILL_MODE_UPPER.
  upTriFactor uses CUSPARSE_OPERATION_NON_TRANSPOSE with a unit diagonal, loTriFactor uses
  CUSPARSE_OPERATION_TRANSPOSE with a non-unit diagonal and the AALo values.

  First call (both factor pointers NULL): creates descriptors, uploads structure and values, runs the csrsv
  analysis for each factor and stores the pointers in A->spptr. Later calls only re-upload the values.
*/
766: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
767: {
768:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
769:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
770:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
771:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
772:   PetscInt                          *AiUp, *AjUp;
773:   PetscScalar                       *AAUp;
774:   PetscScalar                       *AALo;
775:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
      /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (a, above) and as Mat_SeqSBAIJ (b, below); presumably the
         members read through each view (nz, i, j, a) are layout-compatible between the two structs -- confirm */
776:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
777:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
778:   const MatScalar                   *aa = b->a, *v;

780:   PetscFunctionBegin;
781:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
782:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
783:     try {
          /* host staging buffers for the values of both factors; needed on every call and freed before leaving the try block */
784:       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
785:       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
786:       if (!upTriFactor && !loTriFactor) {
787:         /* Allocate Space for the upper triangular matrix */
788:         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
789:         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

791:         /* Fill the upper triangular matrix */
792:         AiUp[0] = (PetscInt)0;
793:         AiUp[n] = nzUpper;
794:         offset  = 0;
795:         for (i = 0; i < n; i++) {
796:           /* set the pointers */
797:           v  = aa + ai[i];
798:           vj = aj + ai[i];
799:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

801:           /* first, set the diagonal elements */
              /* v[nz] is the last stored entry of the row; its reciprocal goes into the leading (diagonal) slot of both factors */
802:           AjUp[offset] = (PetscInt)i;
803:           AAUp[offset] = (MatScalar)1.0 / v[nz];
804:           AiUp[i]      = offset;
805:           AALo[offset] = (MatScalar)1.0 / v[nz];

807:           offset += 1;
808:           if (nz > 0) {
809:             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
810:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
                /* negate the copied off-diagonal values for U; the L values are those negated values divided by v[nz] */
811:             for (j = offset; j < offset + nz; j++) {
812:               AAUp[j] = -AAUp[j];
813:               AALo[j] = AAUp[j] / v[nz];
814:             }
815:             offset += nz;
816:           }
817:         }

819:         /* allocate space for the triangular factor information */
820:         PetscCall(PetscNew(&upTriFactor));
821:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

823:         /* Create the matrix description */
824:         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
825:         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
826:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
827:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
828:   #else
829:         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
830:   #endif
831:         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
832:         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

834:         /* set the matrix */
835:         upTriFactor->csrMat              = new CsrMatrix;
836:         upTriFactor->csrMat->num_rows    = A->rmap->n;
837:         upTriFactor->csrMat->num_cols    = A->cmap->n;
838:         upTriFactor->csrMat->num_entries = a->nz;

840:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
841:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

843:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
844:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

846:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
847:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

849:         /* set the operation */
850:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

852:         /* Create the solve analysis information */
853:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
854:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
855:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
856:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
857:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
858:         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
859:   #endif

861:         /* perform the solve analysis */
862:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
863:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

865:         PetscCallCUDA(WaitForCUDA());
866:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

868:         /* assign the pointer */
869:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

871:         /* allocate space for the triangular factor information */
872:         PetscCall(PetscNew(&loTriFactor));
873:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

875:         /* Create the matrix description */
876:         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
877:         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
878:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
879:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
880:   #else
881:         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
882:   #endif
            /* the "lower" factor is stored with the same upper-triangular pattern and applied through a transposed solve (see solveOp below) */
883:         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
884:         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

886:         /* set the operation */
887:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

889:         /* set the matrix */
890:         loTriFactor->csrMat              = new CsrMatrix;
891:         loTriFactor->csrMat->num_rows    = A->rmap->n;
892:         loTriFactor->csrMat->num_cols    = A->cmap->n;
893:         loTriFactor->csrMat->num_entries = a->nz;

895:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
896:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

898:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
899:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

901:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
902:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

904:         /* Create the solve analysis information */
905:         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
906:         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
907:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
908:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
909:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
910:         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
911:   #endif

913:         /* perform the solve analysis */
914:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
915:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

917:         PetscCallCUDA(WaitForCUDA());
918:         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

920:         /* assign the pointer */
921:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

            /* log the upload of both factors (row offsets, column indices, values), then release the index staging buffers */
923:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
924:         PetscCallCUDA(cudaFreeHost(AiUp));
925:         PetscCallCUDA(cudaFreeHost(AjUp));
926:       } else {
            /* at least one factor already exists: recompute the values only, using the same layout as the first-call branch */
927:         /* Fill the upper triangular matrix */
928:         offset = 0;
929:         for (i = 0; i < n; i++) {
930:           /* set the pointers */
931:           v  = aa + ai[i];
932:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

934:           /* first, set the diagonal elements */
935:           AAUp[offset] = 1.0 / v[nz];
936:           AALo[offset] = 1.0 / v[nz];

938:           offset += 1;
939:           if (nz > 0) {
940:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
941:             for (j = offset; j < offset + nz; j++) {
942:               AAUp[j] = -AAUp[j];
943:               AALo[j] = AAUp[j] / v[nz];
944:             }
945:             offset += nz;
946:           }
947:         }
            /* this branch is entered when either pointer is non-NULL, so make sure both exist before dereferencing them */
948:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
949:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
950:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
951:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
952:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
953:       }
954:       PetscCallCUDA(cudaFreeHost(AAUp));
955:       PetscCallCUDA(cudaFreeHost(AALo));
956:     } catch (char *ex) {
957:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
958:     }
959:   }
960:   PetscFunctionReturn(PETSC_SUCCESS);
961: }
962: #endif

964: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
965: {
966:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
967:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
968:   IS                            ip                 = a->row;
969:   PetscBool                     perm_identity;
970:   PetscInt                      n = A->rmap->n;

972:   PetscFunctionBegin;
973:   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

975: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
976:   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
977: #else
978:   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
979:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
980: #endif
981:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

983:   A->offloadmask = PETSC_OFFLOAD_BOTH;

985:   /* lower triangular indices */
986:   PetscCall(ISIdentity(ip, &perm_identity));
987:   if (!perm_identity) {
988:     IS              iip;
989:     const PetscInt *irip, *rip;

991:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
992:     PetscCall(ISGetIndices(iip, &irip));
993:     PetscCall(ISGetIndices(ip, &rip));
994:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
995:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
996:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
997:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
998:     PetscCall(ISRestoreIndices(iip, &irip));
999:     PetscCall(ISDestroy(&iip));
1000:     PetscCall(ISRestoreIndices(ip, &rip));
1001:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1002:   }
1003:   PetscFunctionReturn(PETSC_SUCCESS);
1004: }

1006: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1007: {
1008:   PetscFunctionBegin;
1009:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1010:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1011:   B->offloadmask = PETSC_OFFLOAD_CPU;

1013: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1014:   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1015:   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1016: #else
1017:   /* determine which version of MatSolve needs to be used. */
1018:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1019:   IS          ip = b->row;
1020:   PetscBool   perm_identity;

1022:   PetscCall(ISIdentity(ip, &perm_identity));
1023:   if (perm_identity) {
1024:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1025:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1026:   } else {
1027:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1028:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1029:   }
1030: #endif
1031:   B->ops->matsolve          = NULL;
1032:   B->ops->matsolvetranspose = NULL;

1034:   /* get the triangular factors */
1035:   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1036:   PetscFunctionReturn(PETSC_SUCCESS);
1037: }

1039: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1040: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1041: {
1042:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1043:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1044:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1045:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1046:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1047:   cusparseIndexBase_t                indexBase;
1048:   cusparseMatrixType_t               matrixType;
1049:   cusparseFillMode_t                 fillMode;
1050:   cusparseDiagType_t                 diagType;

1052:   PetscFunctionBegin;
1053:   /* allocate space for the transpose of the lower triangular factor */
1054:   PetscCall(PetscNew(&loTriFactorT));
1055:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1057:   /* set the matrix descriptors of the lower triangular factor */
1058:   matrixType = cusparseGetMatType(loTriFactor->descr);
1059:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
1060:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1061:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

1063:   /* Create the matrix description */
1064:   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1065:   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1066:   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1067:   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1068:   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

1070:   /* set the operation */
1071:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1073:   /* allocate GPU space for the CSC of the lower triangular factor*/
1074:   loTriFactorT->csrMat                 = new CsrMatrix;
1075:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1076:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1077:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1078:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1079:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1080:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

1082:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1083:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1084:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1085:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1086:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1087:   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1088:   #endif

1090:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1091:   {
1092:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1093:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1094:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1095:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1096:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1097:   #else
1098:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1099:   #endif
1100:     PetscCallCUSPARSE(stat);
1101:   }

1103:   PetscCallCUDA(WaitForCUDA());
1104:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1106:   /* Create the solve analysis information */
1107:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1108:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1109:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1110:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1111:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1112:   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1113:   #endif

1115:   /* perform the solve analysis */
1116:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1117:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1119:   PetscCallCUDA(WaitForCUDA());
1120:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1122:   /* assign the pointer */
1123:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

1125:   /*********************************************/
1126:   /* Now the Transpose of the Upper Tri Factor */
1127:   /*********************************************/

1129:   /* allocate space for the transpose of the upper triangular factor */
1130:   PetscCall(PetscNew(&upTriFactorT));
1131:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

1133:   /* set the matrix descriptors of the upper triangular factor */
1134:   matrixType = cusparseGetMatType(upTriFactor->descr);
1135:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
1136:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1137:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

1139:   /* Create the matrix description */
1140:   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1141:   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1142:   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1143:   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1144:   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

1146:   /* set the operation */
1147:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1149:   /* allocate GPU space for the CSC of the upper triangular factor*/
1150:   upTriFactorT->csrMat                 = new CsrMatrix;
1151:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1152:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1153:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1154:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1155:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1156:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1158:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1159:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1160:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1161:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1162:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1163:   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1164:   #endif

1166:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1167:   {
1168:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1169:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1170:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1171:   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1172:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1173:   #else
1174:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1175:   #endif
1176:     PetscCallCUSPARSE(stat);
1177:   }

1179:   PetscCallCUDA(WaitForCUDA());
1180:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

1182:   /* Create the solve analysis information */
1183:   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1184:   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1185:   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1186:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1187:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1188:   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1189:   #endif

1191:   /* perform the solve analysis */
1192:   /* christ, would it have killed you to put this stuff in a function????????? */
1193:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1194:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1196:   PetscCallCUDA(WaitForCUDA());
1197:   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

1199:   /* assign the pointer */
1200:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1201:   PetscFunctionReturn(PETSC_SUCCESS);
1202: }
1203: #endif

1205: struct PetscScalarToPetscInt {
1206:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1207: };

1209: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1210: {
1211:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1212:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1213:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1214:   cusparseStatus_t              stat;
1215:   cusparseIndexBase_t           indexBase;

1217:   PetscFunctionBegin;
1218:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1219:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1220:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1221:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1222:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1223:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1224:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1225:   PetscCall(PetscLogGpuTimeBegin());
1226:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1227:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1228:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1229:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1230:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1231:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1232:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

1234:     /* set alpha and beta */
1235:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1236:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1237:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1238:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1239:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1240:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

1242:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1243:       CsrMatrix *matrixT      = new CsrMatrix;
1244:       matstructT->mat         = matrixT;
1245:       matrixT->num_rows       = A->cmap->n;
1246:       matrixT->num_cols       = A->rmap->n;
1247:       matrixT->num_entries    = a->nz;
1248:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1249:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1250:       matrixT->values         = new THRUSTARRAY(a->nz);

1252:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1253:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1255: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1256:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1257:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1258:                                indexBase, cusparse_scalartype);
1259:       PetscCallCUSPARSE(stat);
1260:   #else
1261:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1262:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1264:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1265:            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1266:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1267:         */
1268:       if (matrixT->num_entries) {
1269:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1270:         PetscCallCUSPARSE(stat);

1272:       } else {
1273:         matstructT->matDescr = NULL;
1274:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1275:       }
1276:   #endif
1277: #endif
1278:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1279: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1280:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1281: #else
1282:       CsrMatrix *temp  = new CsrMatrix;
1283:       CsrMatrix *tempT = new CsrMatrix;
1284:       /* First convert HYB to CSR */
1285:       temp->num_rows       = A->rmap->n;
1286:       temp->num_cols       = A->cmap->n;
1287:       temp->num_entries    = a->nz;
1288:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1289:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1290:       temp->values         = new THRUSTARRAY(a->nz);

1292:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1293:       PetscCallCUSPARSE(stat);

1295:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1296:       tempT->num_rows       = A->rmap->n;
1297:       tempT->num_cols       = A->cmap->n;
1298:       tempT->num_entries    = a->nz;
1299:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1300:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1301:       tempT->values         = new THRUSTARRAY(a->nz);

1303:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1304:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1305:       PetscCallCUSPARSE(stat);

1307:       /* Last, convert CSC to HYB */
1308:       cusparseHybMat_t hybMat;
1309:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1310:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1311:       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1312:       PetscCallCUSPARSE(stat);

1314:       /* assign the pointer */
1315:       matstructT->mat = hybMat;
1316:       A->transupdated = PETSC_TRUE;
1317:       /* delete temporaries */
1318:       if (tempT) {
1319:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1320:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1321:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1322:         delete (CsrMatrix *)tempT;
1323:       }
1324:       if (temp) {
1325:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1326:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1327:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1328:         delete (CsrMatrix *)temp;
1329:       }
1330: #endif
1331:     }
1332:   }
1333:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1334:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1335:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1336:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1337:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1338:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1339:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1340:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1341:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1342:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1343:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1344:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1345:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1346:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1347:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1348:     }
1349:     if (!cusparsestruct->csr2csc_i) {
1350:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1351:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1353:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1354: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1355:       void  *csr2cscBuffer;
1356:       size_t csr2cscBufferSize;
1357:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1358:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1359:       PetscCallCUSPARSE(stat);
1360:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1361: #endif

1363:       if (matrix->num_entries) {
1364:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1365:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1366:            I checked every parameters and they were just fine. I have no clue why cusparse complains.

1368:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1369:            should be filled with indexBase. So I just take a shortcut here.
1370:         */
1371:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1372: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1373:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1374:         PetscCallCUSPARSE(stat);
1375: #else
1376:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1377:         PetscCallCUSPARSE(stat);
1378: #endif
1379:       } else {
1380:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1381:       }

1383:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1384:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1385: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1386:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1387: #endif
1388:     }
1389:     PetscCallThrust(
1390:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1391:   }
1392:   PetscCall(PetscLogGpuTimeEnd());
1393:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1394:   /* the compressed row indices is not used for matTranspose */
1395:   matstructT->cprowIndices = NULL;
1396:   /* assign the pointer */
1397:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1398:   A->transupdated                                = PETSC_TRUE;
1399:   PetscFunctionReturn(PETSC_SUCCESS);
1400: }

1402: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1403: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1404: {
1405:   const PetscScalar                    *barray;
1406:   PetscScalar                          *xarray;
1407:   thrust::device_ptr<const PetscScalar> bGPU;
1408:   thrust::device_ptr<PetscScalar>       xGPU;
1409:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1410:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1411:   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1412:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1413:   PetscInt                              m   = A->rmap->n;

1415:   PetscFunctionBegin;
1416:   PetscCall(PetscLogGpuTimeBegin());
1417:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1418:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1419:   xGPU = thrust::device_pointer_cast(xarray);
1420:   bGPU = thrust::device_pointer_cast(barray);

1422:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1423:   if (fs->rpermIndices) {
1424:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1425:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1426:   } else {
1427:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1428:   }

1430:   // Solve L Y = X
1431:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1432:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1433:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

1435:   // Solve U X = Y
1436:   if (fs->cpermIndices) {
1437:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1438:   } else {
1439:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1440:   }
1441:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

1443:   // Reorder X with the column permutation if needed, and put the result back to x
1444:   if (fs->cpermIndices) {
1445:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1446:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1447:   }
1448:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1449:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1450:   PetscCall(PetscLogGpuTimeEnd());
1451:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1452:   PetscFunctionReturn(PETSC_SUCCESS);
1453: }

1455: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1456: {
1457:   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1458:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1459:   const PetscScalar                    *barray;
1460:   PetscScalar                          *xarray;
1461:   thrust::device_ptr<const PetscScalar> bGPU;
1462:   thrust::device_ptr<PetscScalar>       xGPU;
1463:   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1464:   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1465:   PetscInt                              m   = A->rmap->n;

1467:   PetscFunctionBegin;
1468:   PetscCall(PetscLogGpuTimeBegin());
1469:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1470:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1471:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1472:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1474:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1475:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1476:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1477:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1478:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1479:   }

1481:   if (!fs->updatedTransposeSpSVAnalysis) {
1482:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1484:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1485:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1486:   }

1488:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1489:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1490:   xGPU = thrust::device_pointer_cast(xarray);
1491:   bGPU = thrust::device_pointer_cast(barray);

1493:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1494:   if (fs->rpermIndices) {
1495:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1496:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1497:   } else {
1498:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1499:   }

1501:   // Solve Ut Y = X
1502:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1503:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

1505:   // Solve Lt X = Y
1506:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1507:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1508:   } else {
1509:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1510:   }
1511:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

1513:   // Reorder X with the column permutation if needed, and put the result back to x
1514:   if (fs->cpermIndices) {
1515:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1516:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1517:   }

1519:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1520:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1521:   PetscCall(PetscLogGpuTimeEnd());
1522:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1523:   PetscFunctionReturn(PETSC_SUCCESS);
1524: }
1525: #else
1526: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1527: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1528: {
1529:   PetscInt                              n = xx->map->n;
1530:   const PetscScalar                    *barray;
1531:   PetscScalar                          *xarray;
1532:   thrust::device_ptr<const PetscScalar> bGPU;
1533:   thrust::device_ptr<PetscScalar>       xGPU;
1534:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1535:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1536:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1537:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1539:   PetscFunctionBegin;
1540:   /* Analyze the matrix and create the transpose ... on the fly */
1541:   if (!loTriFactorT && !upTriFactorT) {
1542:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1543:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1544:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1545:   }

1547:   /* Get the GPU pointers */
1548:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1549:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1550:   xGPU = thrust::device_pointer_cast(xarray);
1551:   bGPU = thrust::device_pointer_cast(barray);

1553:   PetscCall(PetscLogGpuTimeBegin());
1554:   /* First, reorder with the row permutation */
1555:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1557:   /* First, solve U */
1558:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1559:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1561:   /* Then, solve L */
1562:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1563:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1565:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1566:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1568:   /* Copy the temporary to the full solution. */
1569:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1571:   /* restore */
1572:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1573:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1574:   PetscCall(PetscLogGpuTimeEnd());
1575:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1576:   PetscFunctionReturn(PETSC_SUCCESS);
1577: }

/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve using the cached triangular factors,
  without applying any row or column permutation.

  Input Parameters:
+ A  - the factored matrix; its spptr holds the triangular-factor container
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are analyzed and created on first use. The transposed U factor is applied to bb first,
  with the result stored in the container's work vector; the transposed L factor is then applied to the work
  vector and the result is written to xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Neither transposed factor is cached yet: build them now and pick up the freshly created pointers */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtrTranspose;
  }

  /* Device pointers of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: transposed U factor, rhs -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upperT->solveOp, upperT->csrMat->num_rows, upperT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upperT->descr,
                                         upperT->csrMat->values->data().get(), upperT->csrMat->row_offsets->data().get(), upperT->csrMat->column_indices->data().get(),
                                         upperT->solveInfo, rhs, work->data().get(), upperT->solvePolicy, upperT->solveBuffer));

  /* Step 2: transposed L factor, work -> sol */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lowerT->solveOp, lowerT->csrMat->num_rows, lowerT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lowerT->descr,
                                         lowerT->csrMat->values->data().get(), lowerT->csrMat->row_offsets->data().get(), lowerT->csrMat->column_indices->data().get(),
                                         lowerT->solveInfo, work->data().get(), sol, lowerT->solvePolicy, lowerT->solveBuffer));

  /* Hand the arrays back, then close the timing region and account for the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE - Solve with the cached L and U factors, applying the row permutation to the
  right-hand side beforehand and the column permutation to the result afterwards.

  Input Parameters:
+ A  - the factored matrix; its spptr holds the triangular-factor container
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  Data flow: bb --(rpermIndices gather)--> work --(L solve)--> xx --(U solve)--> work --(cpermIndices gather)--> xx.
  The thrust copies are issued on PetscDefaultCudaStream.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device pointers of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));
  /* Wrap the raw device pointers so they can be used as thrust iterators */
  thrust::device_ptr<PetscScalar>       solIt = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsIt = thrust::device_pointer_cast(rhs);

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: gather the right-hand side through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsIt, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsIt, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: L solve, work -> sol */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, work->data().get(), sol, lower->solvePolicy, lower->solveBuffer));

  /* Step 3: U solve, sol -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, sol, work->data().get(), upper->solvePolicy, upper->solveBuffer));

  /* Step 4: gather the work vector through the column permutation into the solution */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solIt);

  /* Hand the arrays back, then close the timing region and account for the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solve with the cached L and U factors, without applying any
  row or column permutation.

  Input Parameters:
+ A  - the factored matrix; its spptr holds the triangular-factor container
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The L solve reads bb and writes the container's work vector; the U solve reads the work vector and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY *)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;

  PetscFunctionBegin;
  /* Device pointers of the solution (write-only) and of the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx, &sol));
  PetscCall(VecCUDAGetArrayRead(bb, &rhs));

  PetscCall(PetscLogGpuTimeBegin());
  /* Step 1: L solve, rhs -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, lower->solveOp, lower->csrMat->num_rows, lower->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lower->descr,
                                         lower->csrMat->values->data().get(), lower->csrMat->row_offsets->data().get(), lower->csrMat->column_indices->data().get(),
                                         lower->solveInfo, rhs, work->data().get(), lower->solvePolicy, lower->solveBuffer));

  /* Step 2: U solve, work -> sol */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(factors->handle, upper->solveOp, upper->csrMat->num_rows, upper->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upper->descr,
                                         upper->csrMat->values->data().get(), upper->csrMat->row_offsets->data().get(), upper->csrMat->column_indices->data().get(),
                                         upper->solveInfo, work->data().get(), sol, upper->solvePolicy, upper->solveBuffer));

  /* Hand the arrays back, then close the timing region and account for the work */
  PetscCall(VecCUDARestoreArrayRead(bb, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx, &sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1686: #endif

1688: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric phase of the ILU(0) factorization, performed in place on the GPU.

  Input Parameters:
+ fact      - the factor matrix; its spptr holds the device arrays, descriptors and buffers used below
. A         - the matrix to factor; must be of type MATSEQAIJCUSPARSE (checked in debug builds only)
- (unnamed) - factorization options, not used by this routine

  Notes:
  A's values are brought up to date on the device, copied into fact's value array (fs->csrVal) and factored in
  place with cusparseXcsrilu02(). The factorization call is skipped for empty matrices (m = 0).

  The SpSV descriptors of L and U are then either analyzed (first factorization, or CUDA older than 12.1.1) or
  refreshed with cusparseSpSV_updateMatrix() (later factorizations with CUDA 12.1.1 or newer).

  In both cases the factor values have changed, so the flag recording a valid transpose SpSV analysis is cleared;
  otherwise a transpose solve issued after a refactorization would keep using the analysis of the old values.

  On return fact's solve and solvetranspose operations are set, and matsolve/matsolvetranspose are cleared.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;

    /* Only the zero-pivot status is examined; numerical_zero is used for the diagnostic message */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* The analysis was already done once: just refresh the values held by the L and U descriptors (nothing to do without a value array) */
    if (fs->csrVal) {
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    }
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed on either path above: reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic phase of the ILU(0) factorization done on the GPU.

  Input Parameters:
+ fact      - the factor matrix to set up; its spptr holds the triangular-factor container
. A         - the matrix to factor (type, squareness and presence of the diagonal are checked in debug builds only)
. (unnamed) - two index sets, not used by this routine
- info      - factorization options; only info->fill is read, and only to be recorded in fact->info

  Notes:
  The routine resets the container, copies A's layout into fact, allocates the device arrays for the 32-bit CSR
  structure and the values, and creates the cuSPARSE descriptors for M (the matrix factored in place), L and U.
  L and U share the same row-pointer, column-index and value arrays and differ only in fill mode and diagonal type.

  It then sizes and allocates the work buffers, runs the csrilu02 analysis, stores an estimate of the numeric
  factorization flops in fs->numericFactFlops, and installs the numeric factorization routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      nrow, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse, noDiag;
    PetscInt  badRow;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &noDiag, &badRow));
    PetscCheck(!noDiag, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, badRow);
  }

  /* Drop whatever a previous factorization left in the container */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Give fact the same layout as A. The host i, j, a arrays are allocated as well; they are not used by the
     GPU factorization and only exist to ease debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ---------------------------------------------------------------------- */
  /* Device copies of A's row pointers and column indices, plus the value   */
  /* array that the in-place factorization will operate on                  */
  /* ---------------------------------------------------------------------- */
  const int *rowPtrA, *colIdxA;

  nrow = fact->rmap->n;
  nnz  = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (nrow + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nnz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nnz));
  /* Ask for the uncompressed row pointers; the returned arrays hold 32-bit integers */
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &rowPtrA, &colIdxA));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, rowPtrA, sizeof(*rowPtrA) * (nrow + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, colIdxA, sizeof(*colIdxA) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ---------------------------------------------------------------------- */
  /* Descriptors for M, L and U                                             */
  /* ---------------------------------------------------------------------- */
  cusparseFillMode_t fillLower = CUSPARSE_FILL_MODE_LOWER, fillUpper = CUSPARSE_FILL_MODE_UPPER;
  cusparseDiagType_t diagUnit = CUSPARSE_DIAG_TYPE_UNIT, diagNonUnit = CUSPARSE_DIAG_TYPE_NON_UNIT;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* About cusparseDiagType_t, from https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t :
    the type indicates if the matrix diagonal entries are unity. The diagonal elements are always assumed to be
    present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that all diagonal
    entries are unity and will not read or modify those entries, regardless of what is actually stored in memory.
  */
  /* L: lower triangular part with unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, nrow, nrow, nnz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillLower, sizeof(fillLower)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagUnit, sizeof(diagUnit)));

  /* U: upper triangular part with the stored (non-unit) diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, nrow, nrow, nnz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillUpper, sizeof(fillUpper)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagNonUnit, sizeof(diagNonUnit)));

  /* ---------------------------------------------------------------------- */
  /* Buffer sizes for csrilu0 and SpSV, then the buffers themselves         */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (nrow) { /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, nrow, nnz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
  }

  /* Dense work vectors X and Y and their descriptors */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * nrow));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * nrow));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, nrow, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, nrow, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L and spsvBuffer_U can not be one and the same buffer in our case, but factBuffer_M can be shared
     with either of them. To save memory, factBuffer_M shares storage with the bigger of the two.
   */
  const PetscBool shareWithL = (fs->spsvBufferSize_L > fs->spsvBufferSize_U) ? PETSC_TRUE : PETSC_FALSE;

  if (shareWithL) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ---------------------------------------------------------------------- */
  /* ilu0 analysis on M                                                     */
  /* The lower(upper) triangular part of M has the sparsity pattern of L(U) */
  /* ---------------------------------------------------------------------- */
  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (nrow) { /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, nrow, nnz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              pivotRow;
    cusparseStatus_t pivotStatus;

    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    pivotStatus = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &pivotRow);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", pivotRow, pivotRow);
  }

  /* ---------------------------------------------------------------------- */
  /* Estimate of the flops spent by the numeric factorization               */
  /* ---------------------------------------------------------------------- */
  {
    Mat_SeqAIJ    *hostA = (Mat_SeqAIJ *)A->data;
    PetscInt      *rowStart, *diagPos;
    PetscLogDouble estimate = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    rowStart = hostA->i;
    diagPos  = hostA->diag;
    for (PetscInt r = 0; r < nrow; r++) {
      PetscInt rowLen, leftLen;

      /* Only rows with nonzeros left of the diagonal contribute */
      if (rowStart[r] >= diagPos[r] || diagPos[r] >= rowStart[r + 1]) continue;

      rowLen = rowStart[r + 1] - rowStart[r];
      /* The nonzeros left of the diagonal are eliminated one by one; each elimination is assumed to update the
         eliminated entry and everything to its right, costing one multiplication and one addition per entry.
         The number of eliminated entries is taken to be (rowLen - 1) / 2.
      */
      leftLen = (rowLen - 1) / 2;
      estimate += leftLen * (2.0 * rowLen - leftLen + 1);
    }
    fs->numericFactFlops = estimate;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solve with the ICC(0) factor using two cusparseSpSV_solve() calls on L.

  Input Parameters:
+ fact - the factor matrix; its spptr holds the cuSPARSE descriptors used below
- b    - the right-hand side

  Output Parameter:
. x    - the solution

  Notes:
  The first solve uses L (non-transposed) and reads b into the intermediate vector fs->Y; the second solve uses
  the transpose of L and reads fs->Y into x. The dense-vector descriptor X is pointed at b for the first solve
  and at x for the second.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *sol;
  const PetscScalar            *rhs;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L Y = X, with X bound to the right-hand side and Y to the intermediate vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve Lt X = Y, with X now bound to the solution array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric phase of the ICC(0) factorization, performed in place on the GPU.

  Input Parameters:
+ fact      - the factor matrix; its spptr holds the device arrays, descriptors and buffers used below
. A         - the matrix to factor; must be of type MATSEQAIJCUSPARSE (checked in debug builds only)
- (unnamed) - factorization options, not used by this routine

  Notes:
  A's values are brought up to date on the device, copied into fact's value array (fs->csrVal) and factored in
  place with cusparseXcsric02(). The factorization call is skipped for empty matrices (m = 0).

  The SpSV descriptors for L and its transpose are then either analyzed (first factorization, or CUDA older than
  12.1.1) or refreshed with cusparseSpSV_updateMatrix() (later factorizations with CUDA 12.1.1 or newer).

  The GPU work is bracketed by PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd(), in the same way as in
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(), so that the flops logged at the end have a matching GPU time.

  On return both solve and solvetranspose are set to MatSolve_SeqAIJCUSPARSE_ICC0, and matsolve/matsolvetranspose
  are cleared.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;

    /* Only the zero-pivot status is examined; numerical_zero is used for the diagnostic message */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* The analysis was already done once: just refresh the values held by the L and Lt descriptors (nothing to do without a value array) */
    if (fs->csrVal) {
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    }
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
       On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
       Hence CUSPARSE_OPERATION_TRANSPOSE is used here.
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

2002: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2003: {
2004:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2005:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
2006:   PetscInt                      m, nz;

2008:   PetscFunctionBegin;
2009:   if (PetscDefined(USE_DEBUG)) {
2010:     PetscInt  i;
2011:     PetscBool flg, missing;

2013:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2014:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2015:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2016:     PetscCall(MatMissingDiagonal(A, &missing, &i));
2017:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2018:   }

2020:   /* Free the old stale stuff */
2021:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

2023:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2024:      but they will not be used. Allocate them just for easy debugging.
2025:    */
2026:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

2028:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2029:   fact->factortype             = MAT_FACTOR_ICC;
2030:   fact->info.factor_mallocs    = 0;
2031:   fact->info.fill_ratio_given  = info->fill;
2032:   fact->info.fill_ratio_needed = 1.0;

2034:   aij->row = NULL;
2035:   aij->col = NULL;

2037:   /* ====================================================================== */
2038:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2039:   /* We'll do in-place factorization on fact                                */
2040:   /* ====================================================================== */
2041:   const int *Ai, *Aj;

2043:   m  = fact->rmap->n;
2044:   nz = aij->nz;

2046:   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2047:   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2048:   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2049:   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2050:   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2051:   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

2053:   /* ====================================================================== */
2054:   /* Create mat descriptors for M, L                                        */
2055:   /* ====================================================================== */
2056:   cusparseFillMode_t fillMode;
2057:   cusparseDiagType_t diagType;

2059:   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2060:   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2061:   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

2063:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2064:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2065:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2066:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2067:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2068:   */
2069:   fillMode = CUSPARSE_FILL_MODE_LOWER;
2070:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2071:   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2072:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2073:   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

2075:   /* ========================================================================= */
2076:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2077:   /* ========================================================================= */
2078:   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2079:   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

2081:   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2082:   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

2084:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2085:   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

2087:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2088:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

2090:   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2091:   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

2093:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
2094:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2095:    */
2096:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2097:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2098:     fs->spsvBuffer_L = fs->factBuffer_M;
2099:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2100:   } else {
2101:     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2102:     fs->spsvBuffer_Lt = fs->factBuffer_M;
2103:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2104:   }

2106:   /* ========================================================================== */
2107:   /* Perform analysis of ic0 on M                                               */
2108:   /* The lower triangular part of M has the same sparsity pattern as L          */
2109:   /* ========================================================================== */
2110:   int              structural_zero;
2111:   cusparseStatus_t status;

2113:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2114:   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2115:   if (PetscDefined(USE_DEBUG)) {
2116:     /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2117:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2118:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2119:   }

2121:   /* Estimate FLOPs of the numeric factorization */
2122:   {
2123:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
2124:     PetscInt      *Ai, nzRow, nzLeft;
2125:     PetscLogDouble flops = 0.0;

2127:     Ai = Aseq->i;
2128:     for (PetscInt i = 0; i < m; i++) {
2129:       nzRow = Ai[i + 1] - Ai[i];
2130:       if (nzRow > 1) {
2131:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2132:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2133:         */
2134:         nzLeft = (nzRow - 1) / 2;
2135:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2136:       }
2137:     }
2138:     fs->numericFactFlops = flops;
2139:   }
2140:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2141:   PetscFunctionReturn(PETSC_SUCCESS);
2142: }
2143: #endif

2145: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2146: {
2147:   // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2148:   Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

2150:   PetscFunctionBegin;
2151:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2152:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2153:   B->offloadmask = PETSC_OFFLOAD_CPU;

2155:   if (!cusparsestruct->use_cpu_solve) {
2156: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2157:     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
2158:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2159: #else
2160:     /* determine which version of MatSolve needs to be used. */
2161:     Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
2162:     IS          isrow = b->row, iscol = b->col;
2163:     PetscBool   row_identity, col_identity;

2165:     PetscCall(ISIdentity(isrow, &row_identity));
2166:     PetscCall(ISIdentity(iscol, &col_identity));
2167:     if (row_identity && col_identity) {
2168:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2169:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2170:     } else {
2171:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
2172:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2173:     }
2174: #endif
2175:   }
2176:   B->ops->matsolve          = NULL;
2177:   B->ops->matsolvetranspose = NULL;

2179:   /* get the triangular factors */
2180:   if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2181:   PetscFunctionReturn(PETSC_SUCCESS);
2182: }

2184: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2185: {
2186:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

2188:   PetscFunctionBegin;
2189:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2190:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2191:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2192:   PetscFunctionReturn(PETSC_SUCCESS);
2193: }

2195: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2196: {
2197:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2199:   PetscFunctionBegin;
2200: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2201:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2202:   if (!info->factoronhost) {
2203:     PetscCall(ISIdentity(isrow, &row_identity));
2204:     PetscCall(ISIdentity(iscol, &col_identity));
2205:   }
2206:   if (!info->levels && row_identity && col_identity) {
2207:     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2208:   } else
2209: #endif
2210:   {
2211:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2212:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2213:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2214:   }
2215:   PetscFunctionReturn(PETSC_SUCCESS);
2216: }

2218: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2219: {
2220:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2222:   PetscFunctionBegin;
2223: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2224:   PetscBool perm_identity = PETSC_FALSE;
2225:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2226:   if (!info->levels && perm_identity) {
2227:     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2228:   } else
2229: #endif
2230:   {
2231:     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2232:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2233:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2234:   }
2235:   PetscFunctionReturn(PETSC_SUCCESS);
2236: }

2238: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2239: {
2240:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

2242:   PetscFunctionBegin;
2243:   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2244:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2245:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2246:   PetscFunctionReturn(PETSC_SUCCESS);
2247: }

2249: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2250: {
2251:   PetscFunctionBegin;
2252:   *type = MATSOLVERCUSPARSE;
2253:   PetscFunctionReturn(PETSC_SUCCESS);
2254: }

2256: /*MC
2257:   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
2258:   of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2259:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2260:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2261:   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2262:   algorithms are not recommended. This class does NOT support direct solver operations.

2264:   Level: beginner

2266: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2267:           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2268: M*/

2270: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2271: {
2272:   PetscInt n = A->rmap->n;

2274:   PetscFunctionBegin;
2275:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2276:   PetscCall(MatSetSizes(*B, n, n, n, n));
2277:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2278:   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

2280:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2281:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2282:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2283:     if (!A->boundtocpu) {
2284:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2285:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2286:     } else {
2287:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2288:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2289:     }
2290:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2291:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2292:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2293:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2294:     if (!A->boundtocpu) {
2295:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2296:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2297:     } else {
2298:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2299:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2300:     }
2301:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2302:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2303:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

2305:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2306:   (*B)->canuseordering = PETSC_TRUE;
2307:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2308:   PetscFunctionReturn(PETSC_SUCCESS);
2309: }

2311: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2312: {
2313:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
2314:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2315: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2316:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2317: #endif

2319:   PetscFunctionBegin;
2320:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2321:     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2322:     if (A->factortype == MAT_FACTOR_NONE) {
2323:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2324:       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2325:     }
2326: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2327:     else if (fs->csrVal) {
2328:       /* We have a factorized matrix on device and are able to copy it to host */
2329:       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2330:     }
2331: #endif
2332:     else
2333:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2334:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2335:     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2336:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2337:   }
2338:   PetscFunctionReturn(PETSC_SUCCESS);
2339: }

2341: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2342: {
2343:   PetscFunctionBegin;
2344:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2345:   *array = ((Mat_SeqAIJ *)A->data)->a;
2346:   PetscFunctionReturn(PETSC_SUCCESS);
2347: }

2349: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2350: {
2351:   PetscFunctionBegin;
2352:   A->offloadmask = PETSC_OFFLOAD_CPU;
2353:   *array         = NULL;
2354:   PetscFunctionReturn(PETSC_SUCCESS);
2355: }

2357: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2358: {
2359:   PetscFunctionBegin;
2360:   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2361:   *array = ((Mat_SeqAIJ *)A->data)->a;
2362:   PetscFunctionReturn(PETSC_SUCCESS);
2363: }

2365: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2366: {
2367:   PetscFunctionBegin;
2368:   *array = NULL;
2369:   PetscFunctionReturn(PETSC_SUCCESS);
2370: }

2372: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2373: {
2374:   PetscFunctionBegin;
2375:   *array = ((Mat_SeqAIJ *)A->data)->a;
2376:   PetscFunctionReturn(PETSC_SUCCESS);
2377: }

2379: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2380: {
2381:   PetscFunctionBegin;
2382:   A->offloadmask = PETSC_OFFLOAD_CPU;
2383:   *array         = NULL;
2384:   PetscFunctionReturn(PETSC_SUCCESS);
2385: }

2387: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2388: {
2389:   Mat_SeqAIJCUSPARSE *cusp;
2390:   CsrMatrix          *matrix;

2392:   PetscFunctionBegin;
2393:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2394:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2395:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2396:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2397:   matrix = (CsrMatrix *)cusp->mat->mat;

2399:   if (i) {
2400: #if !defined(PETSC_USE_64BIT_INDICES)
2401:     *i = matrix->row_offsets->data().get();
2402: #else
2403:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2404: #endif
2405:   }
2406:   if (j) {
2407: #if !defined(PETSC_USE_64BIT_INDICES)
2408:     *j = matrix->column_indices->data().get();
2409: #else
2410:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
2411: #endif
2412:   }
2413:   if (a) *a = matrix->values->data().get();
2414:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2415:   PetscFunctionReturn(PETSC_SUCCESS);
2416: }

/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device copy of the matrix up to date with the host data.

  Input Parameter:
. A - the matrix; PETSC_ERR_GPU is raised if it is bound to the CPU

  Notes:
  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If the nonzero state recorded in the CUSPARSE struct equals A->nonzerostate and the storage format is
  CSR, only the values are uploaded into the existing device array. Otherwise the device matrix, the
  transpose data, the work vector and the row offsets are destroyed and everything is rebuilt from the
  host CSR arrays (from the compressed-row arrays when a->compressedrow.use is set).

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, except when the host has no values array
  (a->a is NULL during a rebuild): then only the structure is uploaded and the mask is left unchanged.
*/
2418: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2419: {
2420:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2421:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2422:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2423:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2424:   cusparseStatus_t              stat;
        /* both: cleared when no host values exist, so the mask is not promoted to PETSC_OFFLOAD_BOTH */
2425:   PetscBool                     both = PETSC_TRUE;

2427:   PetscFunctionBegin;
2428:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2429:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2430:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2431:       CsrMatrix *matrix;
2432:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2434:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2435:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2436:       matrix->values->assign(a->a, a->a + a->nz);
2437:       PetscCallCUDA(WaitForCUDA());
2438:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2439:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
            /* NOTE(review): PETSC_FALSE here vs PETSC_TRUE in the rebuild branch presumably selects a values-only
               invalidation of the cached transpose - confirm against MatSeqAIJCUSPARSEInvalidateTranspose() */
2440:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2441:     } else {
            /* Structure changed (or non-CSR format): throw away all device data and rebuild it */
2442:       PetscInt nnz;
2443:       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2444:       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2445:       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2446:       delete cusparsestruct->workVector;
2447:       delete cusparsestruct->rowoffsets_gpu;
2448:       cusparsestruct->workVector     = NULL;
2449:       cusparsestruct->rowoffsets_gpu = NULL;
2450:       try {
              /* Select the host row structure: compressed rows (with their row-index map) or the full CSR */
2451:         if (a->compressedrow.use) {
2452:           m    = a->compressedrow.nrows;
2453:           ii   = a->compressedrow.i;
2454:           ridx = a->compressedrow.rindex;
2455:         } else {
2456:           m    = A->rmap->n;
2457:           ii   = a->i;
2458:           ridx = NULL;
2459:         }
2460:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
              /* Without host values, take the nonzero count from the row offsets and remember that the
                 device copy will not hold valid values */
2461:         if (!a->a) {
2462:           nnz  = ii[m];
2463:           both = PETSC_FALSE;
2464:         } else nnz = a->nz;
2465:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2467:         /* create cusparse matrix */
2468:         cusparsestruct->nrows = m;
2469:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2470:         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2471:         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2472:         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

              /* Scalar constants 1, 0 and 1 are stored in device memory, and the handle is switched to device pointer mode */
2474:         PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2475:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2476:         PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2477:         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2478:         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479:         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2480:         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

2482:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2483:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2484:           /* set the matrix */
2485:           CsrMatrix *mat   = new CsrMatrix;
2486:           mat->num_rows    = m;
2487:           mat->num_cols    = A->cmap->n;
2488:           mat->num_entries = nnz;
2489:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2490:           mat->row_offsets->assign(ii, ii + m + 1);
2491:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2492:           mat->column_indices->assign(a->j, a->j + nnz);

                /* The values array is always allocated, but filled only when host values exist */
2494:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2495:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2497:           /* assign the pointer */
2498:           matstruct->mat = mat;
2499: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2500:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2501:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2502:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2503:             PetscCallCUSPARSE(stat);
2504:           }
2505: #endif
2506:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2507: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2508:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2509: #else
                /* Build a temporary device CSR matrix that serves only as input to the CSR-to-HYB conversion */
2510:           CsrMatrix *mat   = new CsrMatrix;
2511:           mat->num_rows    = m;
2512:           mat->num_cols    = A->cmap->n;
2513:           mat->num_entries = nnz;
2514:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2515:           mat->row_offsets->assign(ii, ii + m + 1);

2517:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2518:           mat->column_indices->assign(a->j, a->j + nnz);

2520:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2521:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2523:           cusparseHybMat_t hybMat;
2524:           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2525:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2526:           stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2527:           PetscCallCUSPARSE(stat);
2528:           /* assign the pointer */
2529:           matstruct->mat = hybMat;

                /* The temporary CSR copy is no longer needed once the HYB matrix exists */
2531:           if (mat) {
2532:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2533:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2534:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2535:             delete (CsrMatrix *)mat;
2536:           }
2537: #endif
2538:         }

2540:         /* assign the compressed row indices */
2541:         if (a->compressedrow.use) {
2542:           PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2543:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2544:           matstruct->cprowIndices->assign(ridx, ridx + m);
2545:           tmp = m;
2546:         } else {
2547:           cusparsestruct->workVector = NULL;
2548:           matstruct->cprowIndices    = NULL;
2549:           tmp                        = 0;
2550:         }
              /* NOTE(review): the logged byte count uses a->nz, even when nnz was taken from ii[m] and no values were uploaded */
2551:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2553:         /* assign the pointer */
2554:         cusparsestruct->mat = matstruct;
            /* Only exceptions thrown as char* are caught by this handler */
2555:       } catch (char *ex) {
2556:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2557:       }
2558:       PetscCallCUDA(WaitForCUDA());
2559:       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
            /* Record the state the device structure was built for, enabling the values-only path next time */
2560:       cusparsestruct->nonzerostate = A->nonzerostate;
2561:     }
2562:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2563:   }
2564:   PetscFunctionReturn(PETSC_SUCCESS);
2565: }

2567: struct VecCUDAPlusEquals {
2568:   template <typename Tuple>
2569:   __host__ __device__ void operator()(Tuple t)
2570:   {
2571:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2572:   }
2573: };

2575: struct VecCUDAEquals {
2576:   template <typename Tuple>
2577:   __host__ __device__ void operator()(Tuple t)
2578:   {
2579:     thrust::get<1>(t) = thrust::get<0>(t);
2580:   }
2581: };

2583: struct VecCUDAEqualsReverse {
2584:   template <typename Tuple>
2585:   __host__ __device__ void operator()(Tuple t)
2586:   {
2587:     thrust::get<0>(t) = thrust::get<1>(t);
2588:   }
2589: };

/*
  MatMatCusparse - Per-product context stored in C->product->data for the CUSPARSE matrix-product routines.
  It is released by MatDestroy_MatMatCusparse().
*/
2591: struct MatMatCusparse {
2592:   PetscBool      cisdense; /* NOTE(review): presumably records whether C was a host dense matrix on entry - confirm against the symbolic routine */
2593:   PetscScalar   *Bt;       /* buffer released with cudaFree() in the destructor; presumably an explicit transpose of B - TODO confirm */
2594:   Mat            X;        /* intermediate matrix; receives the sparse-dense product for RARt and PtAP */
2595:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2596:   PetscLogDouble flops;
2597:   CsrMatrix     *Bcsr; /* owned here: deleted in the destructor */

2599: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2600:   cusparseSpMatDescr_t matSpBDescr; /* sparse-matrix descriptor, destroyed with cusparseDestroySpMat() */
2601:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2602:   cusparseDnMatDescr_t matBDescr; /* dense-matrix descriptor for B */
2603:   cusparseDnMatDescr_t matCDescr; /* dense-matrix descriptor for C */
2604:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2605:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2606:   void *dBuffer4; /* buffer released with cudaFree() in the destructor */
2607:   void *dBuffer5; /* buffer released with cudaFree() in the destructor */
2608:   #endif
2609:   size_t                mmBufferSize;
2610:   void                 *mmBuffer;  /* buffer released with cudaFree() in the destructor */
2611:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2612:   cusparseSpGEMMDescr_t spgemmDesc;
2613: #endif
2614: };

2616: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2617: {
2618:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2620:   PetscFunctionBegin;
2621:   PetscCallCUDA(cudaFree(mmdata->Bt));
2622:   delete mmdata->Bcsr;
2623: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2624:   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2625:   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2626:   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2627:   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2628:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2629:   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2630:   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2631:   #endif
2632:   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2633:   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2634: #endif
2635:   PetscCall(MatDestroy(&mmdata->X));
2636:   PetscCall(PetscFree(data));
2637:   PetscFunctionReturn(PETSC_SUCCESS);
2638: }

2640: #include <../src/mat/impls/dense/seq/dense.h>

2642: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2643: {
2644:   Mat_Product                  *product = C->product;
2645:   Mat                           A, B;
2646:   PetscInt                      m, n, blda, clda;
2647:   PetscBool                     flg, biscuda;
2648:   Mat_SeqAIJCUSPARSE           *cusp;
2649:   cusparseStatus_t              stat;
2650:   cusparseOperation_t           opA;
2651:   const PetscScalar            *barray;
2652:   PetscScalar                  *carray;
2653:   MatMatCusparse               *mmdata;
2654:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2655:   CsrMatrix                    *csrmat;

2657:   PetscFunctionBegin;
2658:   MatCheckProduct(C, 1);
2659:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2660:   mmdata = (MatMatCusparse *)product->data;
2661:   A      = product->A;
2662:   B      = product->B;
2663:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2664:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2665:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2666:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2667:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2668:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2669:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2670:   switch (product->type) {
2671:   case MATPRODUCT_AB:
2672:   case MATPRODUCT_PtAP:
2673:     mat = cusp->mat;
2674:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2675:     m   = A->rmap->n;
2676:     n   = B->cmap->n;
2677:     break;
2678:   case MATPRODUCT_AtB:
2679:     if (!A->form_explicit_transpose) {
2680:       mat = cusp->mat;
2681:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2682:     } else {
2683:       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2684:       mat = cusp->matTranspose;
2685:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2686:     }
2687:     m = A->cmap->n;
2688:     n = B->cmap->n;
2689:     break;
2690:   case MATPRODUCT_ABt:
2691:   case MATPRODUCT_RARt:
2692:     mat = cusp->mat;
2693:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2694:     m   = A->rmap->n;
2695:     n   = B->rmap->n;
2696:     break;
2697:   default:
2698:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2699:   }
2700:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2701:   csrmat = (CsrMatrix *)mat->mat;
2702:   /* if the user passed a CPU matrix, copy the data to the GPU */
2703:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2704:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2705:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

2707:   PetscCall(MatDenseGetLDA(B, &blda));
2708:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2709:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2710:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2711:   } else {
2712:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2713:     PetscCall(MatDenseGetLDA(C, &clda));
2714:   }

2716:   PetscCall(PetscLogGpuTimeBegin());
2717: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2718:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2719:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2720:   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2721:   #else
2722:   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2723:   #endif

2725:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2726:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2727:     size_t mmBufferSize;
2728:     if (mmdata->initialized && mmdata->Blda != blda) {
2729:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2730:       mmdata->matBDescr = NULL;
2731:     }
2732:     if (!mmdata->matBDescr) {
2733:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2734:       mmdata->Blda = blda;
2735:     }

2737:     if (mmdata->initialized && mmdata->Clda != clda) {
2738:       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2739:       mmdata->matCDescr = NULL;
2740:     }
2741:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2742:       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2743:       mmdata->Clda = clda;
2744:     }

2746:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2747:     if (matADescr) {
2748:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2749:       matADescr = NULL;
2750:     }
2751:   #endif

2753:     if (!matADescr) {
2754:       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2755:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2756:       PetscCallCUSPARSE(stat);
2757:     }

2759:     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

2761:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2762:       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2763:       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2764:       mmdata->mmBufferSize = mmBufferSize;
2765:     }

2767:   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2768:     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2769:   #endif

2771:     mmdata->initialized = PETSC_TRUE;
2772:   } else {
2773:     /* to be safe, always update pointers of the mats */
2774:     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2775:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2776:     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2777:   }

2779:   /* do cusparseSpMM, which supports transpose on B */
2780:   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2781: #else
2782:   PetscInt k;
2783:   /* cusparseXcsrmm does not support transpose on B */
2784:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2785:     cublasHandle_t cublasv2handle;
2786:     cublasStatus_t cerr;

2788:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2789:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2790:     PetscCallCUBLAS(cerr);
2791:     blda = B->cmap->n;
2792:     k    = B->cmap->n;
2793:   } else {
2794:     k = B->rmap->n;
2795:   }

2797:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2798:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2799:   PetscCallCUSPARSE(stat);
2800: #endif
2801:   PetscCall(PetscLogGpuTimeEnd());
2802:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2803:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2804:   if (product->type == MATPRODUCT_RARt) {
2805:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2806:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2807:   } else if (product->type == MATPRODUCT_PtAP) {
2808:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2809:     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2810:   } else {
2811:     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2812:   }
2813:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2814:   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2815:   PetscFunctionReturn(PETSC_SUCCESS);
2816: }

2818: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2819: {
2820:   Mat_Product        *product = C->product;
2821:   Mat                 A, B;
2822:   PetscInt            m, n;
2823:   PetscBool           cisdense, flg;
2824:   MatMatCusparse     *mmdata;
2825:   Mat_SeqAIJCUSPARSE *cusp;

2827:   PetscFunctionBegin;
2828:   MatCheckProduct(C, 1);
2829:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2830:   A = product->A;
2831:   B = product->B;
2832:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2833:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2834:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2835:   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2836:   switch (product->type) {
2837:   case MATPRODUCT_AB:
2838:     m = A->rmap->n;
2839:     n = B->cmap->n;
2840:     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2841:     break;
2842:   case MATPRODUCT_AtB:
2843:     m = A->cmap->n;
2844:     n = B->cmap->n;
2845:     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2846:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2847:     break;
2848:   case MATPRODUCT_ABt:
2849:     m = A->rmap->n;
2850:     n = B->rmap->n;
2851:     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2852:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2853:     break;
2854:   case MATPRODUCT_PtAP:
2855:     m = B->cmap->n;
2856:     n = B->cmap->n;
2857:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2858:     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2859:     break;
2860:   case MATPRODUCT_RARt:
2861:     m = B->rmap->n;
2862:     n = B->rmap->n;
2863:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2864:     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2865:     break;
2866:   default:
2867:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2868:   }
2869:   PetscCall(MatSetSizes(C, m, n, m, n));
2870:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2871:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2872:   PetscCall(MatSetType(C, MATSEQDENSECUDA));

2874:   /* product data */
2875:   PetscCall(PetscNew(&mmdata));
2876:   mmdata->cisdense = cisdense;
2877: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2878:   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2879:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2880: #endif
2881:   /* for these products we need intermediate storage */
2882:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2883:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2884:     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2885:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2886:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2887:     } else {
2888:       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2889:     }
2890:   }
2891:   C->product->data    = mmdata;
2892:   C->product->destroy = MatDestroy_MatMatCusparse;

2894:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2895:   PetscFunctionReturn(PETSC_SUCCESS);
2896: }

2898: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2899: {
2900:   Mat_Product                  *product = C->product;
2901:   Mat                           A, B;
2902:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2903:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2904:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2905:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2906:   PetscBool                     flg;
2907:   cusparseStatus_t              stat;
2908:   MatProductType                ptype;
2909:   MatMatCusparse               *mmdata;
2910: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2911:   cusparseSpMatDescr_t BmatSpDescr;
2912: #endif
2913:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2915:   PetscFunctionBegin;
2916:   MatCheckProduct(C, 1);
2917:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2918:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2919:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2920:   mmdata = (MatMatCusparse *)C->product->data;
2921:   A      = product->A;
2922:   B      = product->B;
2923:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2924:     mmdata->reusesym = PETSC_FALSE;
2925:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2926:     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2927:     Cmat = Ccusp->mat;
2928:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2929:     Ccsr = (CsrMatrix *)Cmat->mat;
2930:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2931:     goto finalize;
2932:   }
2933:   if (!c->nz) goto finalize;
2934:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2935:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2936:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2937:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2938:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2939:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2940:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2941:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2942:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2943:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2944:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2945:   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2946:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2947:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

2949:   ptype = product->type;
2950:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2951:     ptype = MATPRODUCT_AB;
2952:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2953:   }
2954:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2955:     ptype = MATPRODUCT_AB;
2956:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2957:   }
2958:   switch (ptype) {
2959:   case MATPRODUCT_AB:
2960:     Amat = Acusp->mat;
2961:     Bmat = Bcusp->mat;
2962:     break;
2963:   case MATPRODUCT_AtB:
2964:     Amat = Acusp->matTranspose;
2965:     Bmat = Bcusp->mat;
2966:     break;
2967:   case MATPRODUCT_ABt:
2968:     Amat = Acusp->mat;
2969:     Bmat = Bcusp->matTranspose;
2970:     break;
2971:   default:
2972:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2973:   }
2974:   Cmat = Ccusp->mat;
2975:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2976:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2977:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2978:   Acsr = (CsrMatrix *)Amat->mat;
2979:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2980:   Ccsr = (CsrMatrix *)Cmat->mat;
2981:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2982:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2983:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2984:   PetscCall(PetscLogGpuTimeBegin());
2985: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2986:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2987:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2988:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2989:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2990:   PetscCallCUSPARSE(stat);
2991:   #else
2992:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2993:   PetscCallCUSPARSE(stat);
2994:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2995:   PetscCallCUSPARSE(stat);
2996:   #endif
2997: #else
2998:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2999:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3000:   PetscCallCUSPARSE(stat);
3001: #endif
3002:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3003:   PetscCallCUDA(WaitForCUDA());
3004:   PetscCall(PetscLogGpuTimeEnd());
3005:   C->offloadmask = PETSC_OFFLOAD_GPU;
3006: finalize:
3007:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3008:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3009:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3010:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3011:   c->reallocs = 0;
3012:   C->info.mallocs += 0;
3013:   C->info.nz_unneeded = 0;
3014:   C->assembled = C->was_assembled = PETSC_TRUE;
3015:   C->num_ass++;
3016:   PetscFunctionReturn(PETSC_SUCCESS);
3017: }

3019: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3020: {
3021:   Mat_Product                  *product = C->product;
3022:   Mat                           A, B;
3023:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3024:   Mat_SeqAIJ                   *a, *b, *c;
3025:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3026:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3027:   PetscInt                      i, j, m, n, k;
3028:   PetscBool                     flg;
3029:   cusparseStatus_t              stat;
3030:   MatProductType                ptype;
3031:   MatMatCusparse               *mmdata;
3032:   PetscLogDouble                flops;
3033:   PetscBool                     biscompressed, ciscompressed;
3034: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3035:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3036:   cusparseSpMatDescr_t BmatSpDescr;
3037: #else
3038:   int cnz;
3039: #endif
3040:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

3042:   PetscFunctionBegin;
3043:   MatCheckProduct(C, 1);
3044:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3045:   A = product->A;
3046:   B = product->B;
3047:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3048:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3049:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3050:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3051:   a = (Mat_SeqAIJ *)A->data;
3052:   b = (Mat_SeqAIJ *)B->data;
3053:   /* product data */
3054:   PetscCall(PetscNew(&mmdata));
3055:   C->product->data    = mmdata;
3056:   C->product->destroy = MatDestroy_MatMatCusparse;

3058:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3059:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3060:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3061:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3062:   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3063:   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

3065:   ptype = product->type;
3066:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3067:     ptype                                          = MATPRODUCT_AB;
3068:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3069:   }
3070:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3071:     ptype                                          = MATPRODUCT_AB;
3072:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3073:   }
3074:   biscompressed = PETSC_FALSE;
3075:   ciscompressed = PETSC_FALSE;
3076:   switch (ptype) {
3077:   case MATPRODUCT_AB:
3078:     m    = A->rmap->n;
3079:     n    = B->cmap->n;
3080:     k    = A->cmap->n;
3081:     Amat = Acusp->mat;
3082:     Bmat = Bcusp->mat;
3083:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3084:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3085:     break;
3086:   case MATPRODUCT_AtB:
3087:     m = A->cmap->n;
3088:     n = B->cmap->n;
3089:     k = A->rmap->n;
3090:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3091:     Amat = Acusp->matTranspose;
3092:     Bmat = Bcusp->mat;
3093:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3094:     break;
3095:   case MATPRODUCT_ABt:
3096:     m = A->rmap->n;
3097:     n = B->rmap->n;
3098:     k = A->cmap->n;
3099:     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3100:     Amat = Acusp->mat;
3101:     Bmat = Bcusp->matTranspose;
3102:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3103:     break;
3104:   default:
3105:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3106:   }

3108:   /* create cusparse matrix */
3109:   PetscCall(MatSetSizes(C, m, n, m, n));
3110:   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3111:   c     = (Mat_SeqAIJ *)C->data;
3112:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3113:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3114:   Ccsr  = new CsrMatrix;

3116:   c->compressedrow.use = ciscompressed;
3117:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3118:     c->compressedrow.nrows = a->compressedrow.nrows;
3119:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3120:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3121:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3122:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3123:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3124:   } else {
3125:     c->compressedrow.nrows  = 0;
3126:     c->compressedrow.i      = NULL;
3127:     c->compressedrow.rindex = NULL;
3128:     Ccusp->workVector       = NULL;
3129:     Cmat->cprowIndices      = NULL;
3130:   }
3131:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3132:   Ccusp->mat        = Cmat;
3133:   Ccusp->mat->mat   = Ccsr;
3134:   Ccsr->num_rows    = Ccusp->nrows;
3135:   Ccsr->num_cols    = n;
3136:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3137:   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3138:   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3139:   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3140:   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3141:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3142:   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3143:   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3144:   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3145:   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3146:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3147:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3148:     c->nz                = 0;
3149:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3150:     Ccsr->values         = new THRUSTARRAY(c->nz);
3151:     goto finalizesym;
3152:   }

3154:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3155:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3156:   Acsr = (CsrMatrix *)Amat->mat;
3157:   if (!biscompressed) {
3158:     Bcsr = (CsrMatrix *)Bmat->mat;
3159: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3160:     BmatSpDescr = Bmat->matDescr;
3161: #endif
3162:   } else { /* we need to use row offsets for the full matrix */
3163:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3164:     Bcsr                 = new CsrMatrix;
3165:     Bcsr->num_rows       = B->rmap->n;
3166:     Bcsr->num_cols       = cBcsr->num_cols;
3167:     Bcsr->num_entries    = cBcsr->num_entries;
3168:     Bcsr->column_indices = cBcsr->column_indices;
3169:     Bcsr->values         = cBcsr->values;
3170:     if (!Bcusp->rowoffsets_gpu) {
3171:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3172:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3173:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3174:     }
3175:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3176:     mmdata->Bcsr      = Bcsr;
3177: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3178:     if (Bcsr->num_rows && Bcsr->num_cols) {
3179:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3180:       PetscCallCUSPARSE(stat);
3181:     }
3182:     BmatSpDescr = mmdata->matSpBDescr;
3183: #endif
3184:   }
3185:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3186:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3187:   /* precompute flops count */
3188:   if (ptype == MATPRODUCT_AB) {
3189:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3190:       const PetscInt st = a->i[i];
3191:       const PetscInt en = a->i[i + 1];
3192:       for (j = st; j < en; j++) {
3193:         const PetscInt brow = a->j[j];
3194:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3195:       }
3196:     }
3197:   } else if (ptype == MATPRODUCT_AtB) {
3198:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3199:       const PetscInt anzi = a->i[i + 1] - a->i[i];
3200:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3201:       flops += (2. * anzi) * bnzi;
3202:     }
3203:   } else { /* TODO */
3204:     flops = 0.;
3205:   }

3207:   mmdata->flops = flops;
3208:   PetscCall(PetscLogGpuTimeBegin());

3210: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3211:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3212:   // cuda-12.2 requires non-null csrRowOffsets
3213:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3214:   PetscCallCUSPARSE(stat);
3215:   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3216:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3217:   {
3218:     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3219:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3220:   */
3221:     void *dBuffer1 = NULL;
3222:     void *dBuffer2 = NULL;
3223:     void *dBuffer3 = NULL;
3224:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3225:     size_t bufferSize1 = 0;
3226:     size_t bufferSize2 = 0;
3227:     size_t bufferSize3 = 0;
3228:     size_t bufferSize4 = 0;
3229:     size_t bufferSize5 = 0;

3231:     /* ask bufferSize1 bytes for external memory */
3232:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3233:     PetscCallCUSPARSE(stat);
3234:     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3235:     /* inspect the matrices A and B to understand the memory requirement for the next step */
3236:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3237:     PetscCallCUSPARSE(stat);

3239:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3240:     PetscCallCUSPARSE(stat);
3241:     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3242:     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3243:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3244:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3245:     PetscCallCUSPARSE(stat);
3246:     PetscCallCUDA(cudaFree(dBuffer1));
3247:     PetscCallCUDA(cudaFree(dBuffer2));

3249:     /* get matrix C non-zero entries C_nnz1 */
3250:     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3251:     c->nz = (PetscInt)C_nnz1;
3252:     /* allocate matrix C */
3253:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3254:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3255:     Ccsr->values = new THRUSTARRAY(c->nz);
3256:     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3257:     /* update matC with the new pointers */
3258:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3259:     PetscCallCUSPARSE(stat);

3261:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3262:     PetscCallCUSPARSE(stat);
3263:     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3264:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3265:     PetscCallCUSPARSE(stat);
3266:     PetscCallCUDA(cudaFree(dBuffer3));
3267:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3268:     PetscCallCUSPARSE(stat);
3269:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3270:   }
3271:   #else
3272:   size_t bufSize2;
3273:   /* ask bufferSize bytes for external memory */
3274:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3275:   PetscCallCUSPARSE(stat);
3276:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3277:   /* inspect the matrices A and B to understand the memory requirement for the next step */
3278:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3279:   PetscCallCUSPARSE(stat);
3280:   /* ask bufferSize again bytes for external memory */
3281:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3282:   PetscCallCUSPARSE(stat);
3283:   /* The CUSPARSE documentation is not clear, nor the API
3284:      We need both buffers to perform the operations properly!
3285:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3286:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3287:      is stored in the descriptor! What a messy API... */
3288:   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3289:   /* compute the intermediate product of A * B */
3290:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3291:   PetscCallCUSPARSE(stat);
3292:   /* get matrix C non-zero entries C_nnz1 */
3293:   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3294:   c->nz = (PetscInt)C_nnz1;
3295:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3296:                       mmdata->mmBufferSize / 1024));
3297:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3298:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3299:   Ccsr->values = new THRUSTARRAY(c->nz);
3300:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3301:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3302:   PetscCallCUSPARSE(stat);
3303:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3304:   PetscCallCUSPARSE(stat);
3305:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3306: #else
3307:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3308:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3309:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3310:   PetscCallCUSPARSE(stat);
3311:   c->nz                = cnz;
3312:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3313:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3314:   Ccsr->values = new THRUSTARRAY(c->nz);
3315:   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

3317:   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3318:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3319:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3320:      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3321:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3322:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3323:   PetscCallCUSPARSE(stat);
3324: #endif
3325:   PetscCall(PetscLogGpuFlops(mmdata->flops));
3326:   PetscCall(PetscLogGpuTimeEnd());
3327: finalizesym:
3328:   c->free_a = PETSC_TRUE;
3329:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3330:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3331:   c->free_ij = PETSC_TRUE;
3332:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3333:     PetscInt      *d_i = c->i;
3334:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3335:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3336:     ii = *Ccsr->row_offsets;
3337:     jj = *Ccsr->column_indices;
3338:     if (ciscompressed) d_i = c->compressedrow.i;
3339:     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3340:     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3341:   } else {
3342:     PetscInt *d_i = c->i;
3343:     if (ciscompressed) d_i = c->compressedrow.i;
3344:     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345:     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346:   }
3347:   if (ciscompressed) { /* need to expand host row offsets */
3348:     PetscInt r = 0;
3349:     c->i[0]    = 0;
3350:     for (k = 0; k < c->compressedrow.nrows; k++) {
3351:       const PetscInt next = c->compressedrow.rindex[k];
3352:       const PetscInt old  = c->compressedrow.i[k];
3353:       for (; r < next; r++) c->i[r + 1] = old;
3354:     }
3355:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3356:   }
3357:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3358:   PetscCall(PetscMalloc1(m, &c->ilen));
3359:   PetscCall(PetscMalloc1(m, &c->imax));
3360:   c->maxnz         = c->nz;
3361:   c->nonzerorowcnt = 0;
3362:   c->rmax          = 0;
3363:   for (k = 0; k < m; k++) {
3364:     const PetscInt nn = c->i[k + 1] - c->i[k];
3365:     c->ilen[k] = c->imax[k] = nn;
3366:     c->nonzerorowcnt += (PetscInt)!!nn;
3367:     c->rmax = PetscMax(c->rmax, nn);
3368:   }
3369:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3370:   PetscCall(PetscMalloc1(c->nz, &c->a));
3371:   Ccsr->num_entries = c->nz;

3373:   C->nonzerostate++;
3374:   PetscCall(PetscLayoutSetUp(C->rmap));
3375:   PetscCall(PetscLayoutSetUp(C->cmap));
3376:   Ccusp->nonzerostate = C->nonzerostate;
3377:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3378:   C->preallocated     = PETSC_TRUE;
3379:   C->assembled        = PETSC_FALSE;
3380:   C->was_assembled    = PETSC_FALSE;
3381:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382:     mmdata->reusesym = PETSC_TRUE;
3383:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3384:   }
3385:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3386:   PetscFunctionReturn(PETSC_SUCCESS);
3387: }

3389: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B: selects mat->ops->productsymbolic for the product described by mat->product */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* B is only tested for the CUSPARSE type when neither A nor B is bound to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  /* C is only inspected for triple products; for every other product type Ciscusp stays PETSC_TRUE */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const char *apiname  = NULL; /* options title for the user-API path, also the manual page name on both paths */
    const char *apiopt   = NULL; /* option name used when the product was requested through the user API */
    const char *prodname = NULL; /* options title used when the product was set up through the MatProduct interface */
    PetscBool   usecpu   = PETSC_FALSE;

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default: /* no backend option is queried for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title   = product->api_user ? apiname : prodname;
      const char *optname = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, title, "Mat");
      PetscCall(PetscOptionsBool(optname, "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    /* a CPU request simply makes the dispatch below take the AIJ fallback */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      }
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A xx: calls MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add, no transpose and no conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A xx + yy: calls MatMultAddKernel_SeqAIJCUSPARSE() with no transpose and no conjugation */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H xx: calls MatMultAddKernel_SeqAIJCUSPARSE() with both the transpose and Hermitian flags set and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H xx + yy: calls MatMultAddKernel_SeqAIJCUSPARSE() with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T xx: calls MatMultAddKernel_SeqAIJCUSPARSE() with the transpose flag set, no conjugation and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n.

  Each thread handles the single entry i given by its global index in a 1D launch; threads whose index is >= n
  do nothing, so the launch only has to provide at least n threads in total.
  The update is a plain (non-atomic) read-modify-write, so the entries of idx are presumably expected to be
  distinct -- verify against the caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  /* Form the global index in PetscInt, the type of n, so that it is not narrowed to int before the bound check */
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}

/*
  MatMultAddKernel_SeqAIJCUSPARSE - z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+ A     - the matrix
. xx    - the vector x that op(A) is applied to
. yy    - the vector y that is added to the product, or NULL to compute z = op(A) x only
. trans - PETSC_TRUE to apply the transpose of A
- herm  - PETSC_TRUE to apply the conjugate transpose; only accepted together with trans

  Output Parameter:
. zz - the result z; it may be the same vector as yy

  Notes:
  When A has no nonzeros the routine only copies yy to zz (or zeros zz) and returns.

  A transpose product uses an explicitly formed transpose (created on demand) when A->form_explicit_transpose is set
  and herm is not; in that case the product is carried out as a non-transposed product with that matrix.

  When the matrix structure carries cprowIndices (compressed rows), a work vector holds the short side of the product.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: op(A) x = 0, so z is just y (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use the explicit transpose, built on first use; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols; // since y = Ax
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length and is written straight into z, so beta may be 1.0 only when z is y (z then already
         holds y). When y is a different vector, z was obtained write-only above and its contents must not be accumulated:
         beta is 0.0 and y is added after the product.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows; // since y = A^T x
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      /* opA is used as an array index below, so its numeric value must stay within 0..2 */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* report a failed kernel launch here instead of at some later CUDA call */
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* zz holds op(A)*xx only (beta was 0 above) when yy is a separate vector, so yy still has to be added */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T xx + yy: calls MatMultAddKernel_SeqAIJCUSPARSE() with the transpose flag set and no conjugation */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, trans, herm));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3731: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);

/*
  GetDiagonal_CSR - CUDA kernel extracting the diagonal from CSR arrays.

  The thread with global index r (1D launch) scans the entries row[r] .. row[r+1]-1 and writes to diag[r] the value of
  the first entry whose column index equals r, or 0 when the row stores no such entry.
  Threads with r >= len do nothing; row is read at r and r+1.
*/
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (!(r < len)) return;

  const PetscInt first = row[r];
  const PetscInt count = row[r + 1] - first;
  const PetscInt last  = first + count;
  PetscScalar    entry = 0.0;

  for (PetscInt k = first; k < last; k++) {
    if (col[k] == r) {
      entry = val[k];
      break;
    }
  }
  diag[r] = entry;
}

/*
  Fills diag with the diagonal of A. When A->offloadmask is PETSC_OFFLOAD_GPU or PETSC_OFFLOAD_BOTH the diagonal is
  extracted from the device CSR arrays with the GetDiagonal_CSR kernel (CSR format only); otherwise the work is
  delegated to MatGetDiagonal_SeqAIJ().
*/
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE           *cusp    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat;
  PetscScalar                  *d_diag;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_BOTH && A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  const PetscInt nrows = A->rmap->n;
  CsrMatrix     *csr   = (CsrMatrix *)mstruct->mat;

  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
  if (nrows <= 0) PetscFunctionReturn(PETSC_SUCCESS); /* nothing to extract, and no kernel is launched */

  /* one thread per row, 256 threads per block */
  const int nblocks = (int)((nrows + 255) / 256);

  PetscCall(VecCUDAGetArrayWrite(diag, &d_diag));
  GetDiagonal_CSR<<<nblocks, 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), nrows, d_diag);
  PetscCallCUDA(cudaPeekAtLastError());
  PetscCall(VecCUDARestoreArrayWrite(diag, &d_diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly-end hook of this matrix type: all of the work is forwarded to MatAssemblyEnd_SeqAIJ() */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

  When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
          `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create the matrix; the local and global sizes are both set to m x n */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* the preallocation routine takes a non-const array, hence the cast */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, const_cast<PetscInt *>(nnz)));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatDestroy_SeqAIJCUSPARSE - Destroys a `MATSEQAIJCUSPARSE` matrix

  The structure attached to A->spptr is released first (which routine is used depends on
  whether A is a factored matrix), then the composed functions listed below are removed,
  and finally MatDestroy_SeqAIJ() is called.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions that are removed, in this order */
  static const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};
  const size_t             ncomposed  = sizeof(composed) / sizeof(composed[0]);

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  for (size_t k = 0; k < ncomposed; k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates a `MATSEQAIJCUSPARSE` matrix

  Input Parameters:
+ A        - the matrix to duplicate
- cpvalues - duplicate option, handed to MatDuplicate_SeqAIJ() unchanged

  Output Parameter:
. B - the new matrix
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* first build the duplicate with the SeqAIJ routine ... */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* ... then convert that duplicate in place to MATSEQAIJCUSPARSE */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y for `MATSEQAIJCUSPARSE` matrices

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added
- str - the relation between the nonzero patterns of X and Y

  Notes:
  MatAXPY_SeqAIJ() is used when X and Y do not share the same axpy implementation, and also when the
  pattern relation ends up being neither `SAME_NONZERO_PATTERN` nor `SUBSET_NONZERO_PATTERN`.

  When the relation is not given as `SAME_NONZERO_PATTERN` but both matrices store the same number of
  nonzeros, the row offsets and column indices are compared on the device; if they agree the patterns are
  treated as identical.

  With identical patterns the update is one cublasXaxpy() over the stored values. With a subset pattern
  cusparse_csr_spgeam() is called with the CSR arrays of Y used as the output.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x  = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *y  = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  CsrMatrix          *csry, *csrx;
  const PetscScalar  *ax;
  PetscScalar        *ay;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not use the same axpy routine: use the SeqAIJ implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;

  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !(x->compressedrow.use || y->compressedrow.use)) {
    const bool same_rows = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());

    /* the column indices are only compared when the row offsets already agree */
    if (same_rows && thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin())) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (str != SAME_NONZERO_PATTERN && Y->cmap->n == 1) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN || str == SUBSET_NONZERO_PATTERN) {
    /* both device paths work directly on the value arrays of X (read) and Y (read/write) */
    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    if (str == SAME_NONZERO_PATTERN) {
      /* identical patterns: the update is an axpy of length nz on the stored values */
      cublasHandle_t cublasv2handle;
      PetscBLASInt   one = 1, bnz = 1;

      PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
      PetscCall(PetscBLASIntCast(x->nz, &bnz));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
      PetscCall(PetscLogGpuFlops(2.0 * bnz));
      PetscCall(PetscLogGpuTimeEnd());
    } else {
      /* subset pattern: a*X + b*Y with b = 1, the result being written over the CSR arrays of Y */
      PetscScalar b     = 1.0;
      auto        xrows = csrx->row_offsets->data().get();
      auto        xcols = csrx->column_indices->data().get();
      auto        yrows = csry->row_offsets->data().get();
      auto        ycols = csry->column_indices->data().get();
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      size_t bufferSize;
      void  *buffer;
#endif

      /* a and b live on the host, so switch the pointer mode for the duration of the call */
      PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, xrows, xcols, &b, cy->mat->descr, y->nz, ay, yrows, ycols, cy->mat->descr, ay, yrows, ycols, &bufferSize));
      PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
#endif
      PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, xrows, xcols, &b, cy->mat->descr, y->nz, ay, yrows, ycols, cy->mat->descr, ay, yrows, ycols, buffer));
#else
      PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, xrows, xcols, &b, cy->mat->descr, y->nz, ay, yrows, ycols, cy->mat->descr, ay, yrows, ycols));
#endif
      PetscCall(PetscLogGpuFlops(x->nz + y->nz));
      PetscCall(PetscLogGpuTimeEnd());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(buffer));
#endif
      PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    }
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* any other pattern relation is handled by the SeqAIJ implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of a `MATSEQAIJCUSPARSE` matrix

  Input Parameters:
+ Y - the matrix, scaled in place
- a - the scaling factor

  Note:
  The scaling is a single cublasXscal() call over the nz stored values.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  const Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t     handle;
  PetscScalar       *vals;
  PetscBLASInt       nvals  = 1;
  const PetscBLASInt stride = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  /* the nonzero count must be representable as a BLAS integer */
  PetscCall(PetscBLASIntCast(aij->nz, &nvals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, nvals, &a, vals, stride));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatZeroEntries_SeqAIJCUSPARSE - Sets all stored values of the matrix to zero

  Notes:
  For a non-factored matrix whose values array exists in the CSR structure on the device, that array
  is filled with zeros and the offload mask is set to `PETSC_OFFLOAD_GPU`. The values of the transpose
  structure, when present, are zeroed as well.

  In every other case the host array is zeroed and the offload mask is set to `PETSC_OFFLOAD_CPU`.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_on_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        zeroed_on_gpu = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      /* zeroing the transpose does not by itself mark the device data as the valid copy */
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (!zeroed_on_gpu) {
    /* a->i[number of rows] is the length of the host values array that is cleared */
    PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatGetCurrentMemType_SeqAIJCUSPARSE - Reports the memory type of the matrix

  Output Parameter:
. m - always set to `PETSC_MEMTYPE_CUDA`; the matrix argument is not inspected
*/
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  m[0] = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the matrix operations between the SeqAIJ and the CUSPARSE implementations

  Input Parameters:
+ A   - the matrix
- flg - `PETSC_TRUE` to install the SeqAIJ operations, `PETSC_FALSE` to install the CUSPARSE ones

  Notes:
  For a factored matrix only A->boundtocpu is updated.

  When binding to the CPU the data is first copied back with MatSeqAIJCUSPARSECopyFromGPU(), the
  array-access operation table of the SeqAIJ data is cleared, and the CUSPARSE specific composed
  functions are removed. When unbinding, all of these are installed again.

  Inodes are used only while bound to the CPU and only if a->inode.size_csr is set.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices: just remember the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (!flg) {
    /* matrix level operations, CUSPARSE versions */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJCUSPARSE;
    /* array access operations of the SeqAIJ data, CUSPARSE versions */
    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* bring the data back before the SeqAIJ operations take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* clear every entry of the SeqAIJ array-access operation table at once */
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  }
  A->boundtocpu = flg;
  /* inodes are enabled only when bound to the CPU and the inode sizes are available */
  aij->inode.use = (flg && aij->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a `MATSEQAIJCUSPARSE` matrix

  Input Parameters:
+ A     - the matrix to convert
. (unnamed) - the requested type, not used
- reuse - `MAT_INITIAL_MATRIX` duplicates A into newmat, `MAT_REUSE_MATRIX` copies A into the existing
          newmat, any other value converts *newmat as it is

  Output Parameter:
. newmat - the converted matrix

  Notes:
  Unless reuse is `MAT_REUSE_MATRIX`, a matrix without spptr gets a new CUSPARSE structure (the kind
  depends on whether it is a factored matrix) with its own cuSPARSE handle bound to
  PetscDefaultCudaStream, and its offload mask is set to `PETSC_OFFLOAD_UNALLOCATED`.

  The default vector type becomes `VECCUDA` and the CUSPARSE operations are installed.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default: /* nothing to prepare, *newmat is converted as it stands */
    break;
  }
  B = *newmat;

  /* replace the default vector type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that stay the same whether or not the matrix is bound to the CPU */
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  /* install the remaining CUSPARSE operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCreate_SeqAIJCUSPARSE - Constructor of the `MATSEQAIJCUSPARSE` type

  The matrix is first set up with MatCreate_SeqAIJ() and then converted in place with
  MatConvert_SeqAIJ_SeqAIJCUSPARSE().
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* start from the SeqAIJ setup ... */
  PetscCall(MatCreate_SeqAIJ(B));
  /* ... and add the CUSPARSE layer on top of it, in place */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

4141: /*MC
4142:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

4144:    Options Database Keys:
4145: +  -mat_type aijcusparse                 - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4146: .  -mat_cusparse_storage_format csr      - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4147:                                            Other options include ell (ellpack) or hyb (hybrid).
4148: .  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4149: -  -mat_cusparse_use_cpu_solve           - Performs the `MatSolve()` on the CPU

4151:   Level: beginner

4153:   Notes:
4154:   These matrices can be in either CSR, ELL, or HYB format.

4156:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

4158:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4159:   if some integer values passed in do not fit in `int`.

4161: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4162: M*/

/*
  MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
  `MATSOLVERCUSPARSE` factory for `MATSEQAIJCUSPARSE` matrices, once per supported factor type
  (LU, Cholesky, ILU and ICC, in that order).
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes) / sizeof(ftypes[0]);

  PetscFunctionBegin;
  for (size_t k = 0; k < nftypes; k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSE_Destroy - Releases the `Mat_SeqAIJCUSPARSE` structure attached to mat->spptr

  Nothing is done when mat->spptr is NULL. Otherwise the matrix and transpose mult structures, the
  auxiliary arrays and the cuSPARSE handle (if one exists) are destroyed, and the structure itself
  is freed through mat->spptr.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
  /* delete on a null pointer is a no-op, so no checks are needed here */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->csr2csc_i;
  delete cusp->coords;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  CsrMatrix_Destroy - Deletes a `CsrMatrix` together with its values, column index and row offset arrays

  On return *mat is NULL. A NULL *mat on entry is accepted and ignored.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload that releases a triangular factor structure

  Each resource is released only if it is set; the structure itself is freed with PetscFree(),
  called on *trifactor. A NULL *trifactor on entry is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(PETSC_SUCCESS);
  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  /* AA_h is released with cudaFreeHost(), unlike the other buffers */
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload that releases a mult structure

  Input Parameters:
+ matstruct - address of the structure; *matstruct is NULL on return
- format    - storage format, which decides how the stored matrix is destroyed

  Notes:
  A NULL *matstruct on entry is accepted and ignored.

  With CUDA 11 or later an error is raised if a matrix is stored in the ELL or HYB format.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (!ms) PetscFunctionReturn(PETSC_SUCCESS);
  if (ms->mat) {
    if (format != MAT_CUSPARSE_ELL && format != MAT_CUSPARSE_HYB) {
      CsrMatrix *csr = (CsrMatrix *)ms->mat;

      PetscCall(CsrMatrix_Destroy(&csr));
    } else {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      cusparseHybMat_t hybMat = (cusparseHybMat_t)ms->mat;
      PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
    }
  }
  if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
  delete ms->cprowIndices;
  /* the three scalar constants are released with cudaFree() */
  if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
  if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
  if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

  /* one slot per entry of cuSPARSE SpMV data; only initialized slots own resources */
  for (int i = 0; i < 3; i++) {
    if (!ms->cuSpMV[i].initialized) continue;
    PetscCallCUDA(cudaFree(ms->cuSpMV[i].spmvBuffer));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[i].vecXDescr));
    PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
    if (ms->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[i]));
    if (ms->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[i]));
  #endif
  }
#endif
  delete ms;
  *matstruct = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases everything held by a triangular factors structure
  while keeping the structure itself (and its cuSPARSE handle) alive

  Input Parameter:
. trifactors - address of the structure; nothing is done when *trifactors is NULL

  Notes:
  Every member that is released is also cleared, so that the routine can safely be called again
  (MatSeqAIJCUSPARSETriFactors_Destroy() calls it too) and so that code testing a member against
  NULL does not mistake a stale pointer for a live allocation.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* buffers released with cudaFree(); each pointer is cleared right after being freed */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    fs->csrRowPtr32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    fs->csrColIdx32 = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->diag));
    fs->diag = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    fs->factBuffer_M = NULL; /* it aliased one of the two buffers just freed */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;

    /* cuSPARSE descriptors and info objects, cleared for the same reason */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    /* arrays released with PetscFree() */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Destroys a triangular factors structure

  The contents are released with MatSeqAIJCUSPARSETriFactors_Reset(), then the cuSPARSE handle is
  destroyed and the structure is freed. A NULL *trifactors on entry is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  /* the handle is not touched by the reset and has to be destroyed here */
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  IJCompare - Strict "less than" ordering of (first, second) index pairs

  Pairs are ordered by their first entry; ties are broken by the second entry. The call operator
  is const-qualified so that the comparator can also be invoked through a const object.
*/
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    const PetscInt first1 = thrust::get<0>(t1), first2 = thrust::get<0>(t2);

    /* different first entries decide the order on their own */
    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};

/*
  MatSeqAIJCUSPARSEInvalidateTranspose - Marks the transpose of A as out of date

  Input Parameters:
+ A       - the matrix, which must be of type `MATSEQAIJCUSPARSE`
- destroy - if `PETSC_TRUE`, the transpose mult structure and the csr2csc_i array are destroyed too

  Note:
  Nothing is changed when A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - Destroy callback for the COO structure attached to the matrix

  Input Parameter:
. data - address of the pointer to the `MatCOOStruct_SeqAIJ` to destroy

  Note:
  The perm and jmap arrays are released with cudaFree(), the structure itself with PetscFree().
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo_d = static_cast<MatCOOStruct_SeqAIJ *>(*data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo_d->perm));
  PetscCallCUDA(cudaFree(coo_d->jmap));
  PetscCall(PetscFree(coo_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Sets up COO assembly for a `MATSEQAIJCUSPARSE` matrix

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  The memory type is detected from coo_i only. Indices in device memory are first copied into
  temporary host arrays, since the preallocation itself is done by MatSetPreallocationCOO_SeqAIJ().

  Afterwards the matrix is copied to the GPU and a copy of the host COO structure, with jmap and perm
  allocated by cudaMalloc(), is attached to the matrix as "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscMemType         mtype = PETSC_MEMTYPE_HOST;
  PetscBool            dev_ij;
  PetscInt            *i = coo_i, *j = coo_j; /* used as they are unless they are in device memory */
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  dev_ij = PetscMemTypeDevice(mtype) ? PETSC_TRUE : PETSC_FALSE;
  if (dev_ij) {
    /* stage the indices in temporary host arrays */
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  {
    const size_t jmap_bytes = (coo_h->nz + 1) * sizeof(PetscCount);
    const size_t perm_bytes = coo_h->Atot * sizeof(PetscCount);

    PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, jmap_bytes));
    PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, jmap_bytes, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, perm_bytes));
    PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, perm_bytes, cudaMemcpyHostToDevice));
  }

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAddCOOValues - Kernel that accumulates COO values into the stored values of the matrix

  Input Parameters:
+ kv    - the COO values
. nnz   - number of entries of a[] to process
. jmap  - for entry i, the range [jmap[i], jmap[i+1]) of positions in perm[] that contribute to a[i]
. perm  - indices into kv[]
- imode - with `INSERT_VALUES` the previous content of a[i] is discarded, otherwise the sum is added to it

  Output Parameter:
. a - the values, updated in place

  Notes:
  Uses a one-dimensional grid-stride loop, so any one-dimensional launch configuration covers all
  nnz entries.

  The thread index and the stride are computed in `PetscCount` arithmetic; the products of the
  32-bit built-in variables would otherwise be evaluated in 32 bits and could wrap.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* widen before multiplying */
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;

    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets or adds COO values into the device value array of the matrix

  Input Parameters:
+ A     - the matrix; it must have the "__PETSc_MatCOOStruct_Device" container composed with it
. v     - the COO values; host or device memory, as detected by PetscGetMemType()
- imode - INSERT_VALUES to overwrite the stored values, any other mode to add to them

  Notes:
  When v[] lives on the host, a temporary device copy of coo->n scalars is made and freed before returning.

  An error is raised if the device COO struct is missing from the matrix.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  /* fail with a clear message instead of handing a NULL container to PetscContainerGetPointer() */
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* divide in 64-bit first, then narrow: casting (Annz + 255) to int before dividing could truncate */
    const int nblocks = (int)((Annz + 255) / 256);

    MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  /* release the temporary device copy of the host values */
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`, pass `NULL` if not needed
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`, pass `NULL` if not needed

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  If both `i` and `j` are `NULL` the routine returns immediately; otherwise only the non-`NULL` outputs are set.

  The ELL and HYB storage formats are not supported.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* nothing requested: return only when BOTH outputs are NULL, a single NULL output is simply skipped below */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the full row offsets on the device from the host row pointers the first time they are requested */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - gives back the device `i` and `j` index arrays previously obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE`, the same flag that was passed to `MatSeqAIJCUSPARSEGetIJ()`; it is not used by this routine
. i          - the CSR row pointers, set to `NULL` on return when provided
- j          - the CSR column indices, set to `NULL` on return when provided

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* kept only for symmetry with the Get routine */
  if (i != NULL) i[0] = NULL;
  if (j != NULL) j[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - provides read-only access to the device array holding the nonzero entries of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

  The ELL and HYB storage formats are not supported.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *store;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  store = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(store->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  a[0] = store->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only device array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  a[0] = NULL; /* read-only access: the matrix itself is left untouched */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - provides read-write access to the device array holding the data of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

  The offload mask of the matrix is set to `PETSC_OFFLOAD_GPU` and `MatSeqAIJCUSPARSEInvalidateTranspose()` is called on the matrix.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *store;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* existing values may be read through the pointer, so the device copy has to be current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  store = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(store->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  A->offloadmask = PETSC_OFFLOAD_GPU;
  a[0]           = store->values->data().get();
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - gives back the read-write device array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

  Note:
  Calls `MatSeqAIJInvalidateDiagonal()` and increases the object state of the matrix.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  a[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - provides write access to the device array holding the data of a `MATSEQAIJCUSPARSE` matrix

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Notes:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

  The device data structure must already exist, otherwise an error is raised.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *store;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(!(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, the current device contents are not needed */
  PetscCheck(gpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  store = (CsrMatrix *)gpu->mat->mat;
  PetscCheck(store->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  A->offloadmask = PETSC_OFFLOAD_GPU;
  a[0]           = store->values->data().get();
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only device array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data, set to `NULL` on return

  Level: developer

  Note:
  Calls `MatSeqAIJInvalidateDiagonal()` and increases the object state of the matrix.

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values have been (re)written through the pointer */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  a[0] = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict ordering of 4-tuples by their first component and then by their second; the last two components are ignored */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = thrust::get<0>(t1);
    const int first2 = thrust::get<0>(t2);

    /* different leading components decide the order, ties are broken by the second component */
    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};

/* Unary functor that adds a fixed integer offset to its argument */
struct Shift {
  int _shift; /* the offset added by operator() */

  Shift(int shift) : _shift(shift) { }

  __host__ __device__ inline int operator()(const int &c)
  {
    const int shifted = c + _shift;
    return shifted;
  }
};

/*
  MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

  Input Parameters:
+ A     - first matrix, its columns come first in C
. B     - second matrix, must have the same number of local rows as A; its column indices are shifted by the number of columns of A
- reuse - MAT_INITIAL_MATRIX creates C (sizes A->rmap->n by A->cmap->n + B->cmap->n) and builds its nonzero pattern on the device,
          any other supported value only refreshes the values of an existing C; MAT_INPLACE_MATRIX is not supported

  Output Parameter:
. C - the merged matrix

  Notes:
  The ELL and HYB storage formats are not supported for A, B and (when reusing) C.

  With MAT_INITIAL_MATRIX the entries of A and B are merged in (row, column) order with thrust::merge(); the positions taken in C
  by the entries of A, followed by those taken by the entries of B, are stored in Ccusp->coords. The reuse branch uses that array to
  scatter the current values of A and B into C without recomputing the pattern.

  If A and B both request an explicit transpose, the transpose of C is assembled from the transposes of A and B.

  On return C is marked assembled with its offload mask set to PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and the device structures that describe it */
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->coords        = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* the scratch arrays declared in this scope are released when it ends, also on early error returns */
        THRUSTINTARRAY32  Acoo(Annz);
        THRUSTINTARRAY32  Bcoo(Bnnz);
        THRUSTINTARRAY32  Ccoo(c->nz);
        THRUSTINTARRAY32 *Aroff, *Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
            PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        PetscCall(PetscLogGpuTimeBegin());
        /* expand the CSR row offsets of A and B into one row index per nonzero */
        stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo.data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* tag every entry with its origin: 1 for A, 0 for B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz + Bnnz);
        auto             Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
        auto             Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
        auto             Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(), Bcib, Bcsr->values->begin(), Bperm));
        auto             Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(), Bcie, Bcsr->values->end(), Bperm));
        auto             Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm.begin()));
        auto             p1  = Ccusp->coords->begin();
        auto             p2  = Ccusp->coords->begin();
        thrust::advance(p2, Annz);
        PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
        /* undo the in-place shift of the column indices of B */
        thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
  #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
        auto pred = thrust::identity<int>();
  #else
        auto pred = cuda::std::identity();
  #endif
        /* coords[0, Annz) receives the positions in C of the entries of A, coords[Annz, nz) those of the entries of B */
        PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm.begin(), p1, pred));
        PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm.begin(), p2, pred));
#endif
        stat = cusparseXcoo2csr(Ccusp->handle, Ccoo.data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
        PetscCallCUSPARSE(stat);
        PetscCall(PetscLogGpuTimeEnd());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* the rows of the transpose of C are the rows of the transpose of A followed by those of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          /* step back one entry: the last offset of A is overwritten by the first (shifted) offset of B */
          thrust::advance(rT, -1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero pattern on the host; the host values array is only allocated here, not filled */
    c->free_a = PETSC_TRUE;
    PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
    PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
    c->free_ij = PETSC_TRUE;
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* reuse: the pattern of C is kept, only its values are refreshed from A and B */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
      /* coords[0, nnz(A)) addresses the entries of C coming from A, the remainder those coming from B */
      auto pmid = Ccusp->coords->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies entries of the device value array of the matrix into v[]

  Input Parameters:
+ A   - the matrix
. n   - number of values to copy
- idx - host array with the positions, in the value array, of the entries to copy; if NULL (or n is zero)
        the first n values are copied

  Output Parameter:
. v - the copied values; may be host or device memory (detected with isCudaMem())

  Notes:
  With idx[] the selection is done on the device; when v[] is on the host the result goes through a temporary
  device buffer that is released automatically, also on error returns.

  Transfers into a host v[] are logged as GPU to CPU traffic.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                     w(dmem ? 0 : n); /* staging buffer, left empty when v[] is already on the device */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[k]] with dv[k] for k = 0 .. n-1 and apply VecCUDAEquals to every pair */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travel from the device to the host here, so log them as GPU to CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}