Actual source code: aijcusparse.cu

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library,
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  8: #include <petscconf.h>
  9: #include <../src/mat/impls/aij/seq/aij.h>
 10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
 15: #include <thrust/adjacent_difference.h>
 16: #if PETSC_CPP_VERSION >= 14
 17:   #define PETSC_HAVE_THRUST_ASYNC 1
 18:   // thrust::for_each(thrust::cuda::par.on()) requires C++14
 19:   #include <thrust/async/for_each.h>
 20: #endif
 21: #include <thrust/iterator/constant_iterator.h>
 22: #include <thrust/remove.h>
 23: #include <thrust/sort.h>
 24: #include <thrust/unique.h>

 26: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
 27: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
 28: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
 29:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

 31:   typedef enum {
 32:       CUSPARSE_MV_ALG_DEFAULT = 0,
 33:       CUSPARSE_COOMV_ALG      = 1,
 34:       CUSPARSE_CSRMV_ALG1     = 2,
 35:       CUSPARSE_CSRMV_ALG2     = 3
 36:   } cusparseSpMVAlg_t;

 38:   typedef enum {
 39:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 40:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 41:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 42:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 43:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 44:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 45:       CUSPARSE_SPMM_COO_ALG1    = 1,
 46:       CUSPARSE_SPMM_COO_ALG2    = 2,
 47:       CUSPARSE_SPMM_COO_ALG3    = 3,
 48:       CUSPARSE_SPMM_COO_ALG4    = 5,
 49:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 50:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 51:   } cusparseSpMMAlg_t;

 53:   typedef enum {
 54:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 55:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 56:   } cusparseCsr2CscAlg_t;
 57:   */
 58: const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
 59: const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
 60: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
 61: #endif
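/* Illustration (not part of the original file): PetscOptionsEnum() returns the 0-based position of the
   matched string, so keeping the arrays above in integer-value order lets that position be used directly
   as the corresponding cuSPARSE enum value, e.g.

     -mat_cusparse_spmv_alg CSRMV_ALG1   ->  position 2  ->  CUSPARSE_CSRMV_ALG1
     -mat_cusparse_spmm_alg CSR_ALG2     ->  position 6  ->  CUSPARSE_SPMM_CSR_ALG2

   (option names as registered in MatSetFromOptions_SeqAIJCUSPARSE() below). */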

 63: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 64: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 65: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

 67: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 68: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 69: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

 71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
 72: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 74: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
 75: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 76: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
 77: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
 78: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
 79: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 80: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 81: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 82: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
 83: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
 84: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 86: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
 88: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
 89: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
 90: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

 92: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
 93: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

 95: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 96: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 97: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

 99: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
100: {
101:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

103:   switch (op) {
104:   case MAT_CUSPARSE_MULT:
105:     cusparsestruct->format = format;
106:     break;
107:   case MAT_CUSPARSE_ALL:
108:     cusparsestruct->format = format;
109:     break;
110:   default:
111:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112:   }
113:   return 0;
114: }

116: /*@
117:    MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118:    operation. Only the `MatMult()` operation can use different GPU storage formats.

120:    Not Collective

122:    Input Parameters:
123: +  A - Matrix of type `MATSEQAIJCUSPARSE`
124: .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
125:         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

130:    Level: intermediate

132: .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
133: @*/
134: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
135: {
137:   PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
138:   return 0;
139: }
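/* Usage sketch (not from the original file): switch the SpMV storage format of an existing
   MATSEQAIJCUSPARSE matrix A to ELL, either in code

     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   or at runtime with -mat_cusparse_mult_storage_format ell, parsed in MatSetFromOptions_SeqAIJCUSPARSE(). */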

141: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
142: {
143:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

145:   cusparsestruct->use_cpu_solve = use_cpu;
146:   return 0;
147: }

149: /*@
150:    MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

152:    Input Parameters:
153: +  A - Matrix of type `MATSEQAIJCUSPARSE`
154: -  use_cpu - flag indicating whether to use the built-in CPU `MatSolve()`

158:    Note:
159:    The cuSparse LU solver currently computes the factors with the built-in CPU method
160:    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
161:    Use this method to specify whether the solve is done on the CPU or the GPU (the GPU is the default).

163:    Level: intermediate

165: .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
166: @*/
167: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
168: {
170:   PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
171:   return 0;
172: }
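/* Usage sketch (not from the original file): keep the (I)LU triangular solves on the CPU while the
   remaining matrix operations stay on the GPU

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or, at runtime, -mat_cusparse_use_cpu_solve. */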

174: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
175: {
176:   switch (op) {
177:   case MAT_FORM_EXPLICIT_TRANSPOSE:
178:     /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
179:     if (A->form_explicit_transpose && !flg) MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
180:     A->form_explicit_transpose = flg;
181:     break;
182:   default:
183:     MatSetOption_SeqAIJ(A, op, flg);
184:     break;
185:   }
186:   return 0;
187: }
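/* Usage sketch (not from the original file): ask the matrix to keep an explicit transpose on the GPU,
   which operations such as MatMultTranspose() can then reuse instead of rebuilding it

     PetscCall(MatSetOption(A, MAT_FORM_EXPLICIT_TRANSPOSE, PETSC_TRUE));
*/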

189: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

191: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
192: {
193:   Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
194:   IS                  isrow = b->row, iscol = b->col;
195:   PetscBool           row_identity, col_identity;
196:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

198:   MatSeqAIJCUSPARSECopyFromGPU(A);
199:   MatLUFactorNumeric_SeqAIJ(B, A, info);
200:   B->offloadmask = PETSC_OFFLOAD_CPU;
201:   /* determine which version of MatSolve needs to be used. */
202:   ISIdentity(isrow, &row_identity);
203:   ISIdentity(iscol, &col_identity);

205:   if (!cusparsestruct->use_cpu_solve) {
206:     if (row_identity && col_identity) {
207:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
208:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
209:     } else {
210:       B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
211:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
212:     }
213:   }
214:   B->ops->matsolve          = NULL;
215:   B->ops->matsolvetranspose = NULL;

217:   /* get the triangular factors */
218:   if (!cusparsestruct->use_cpu_solve) MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
219:   return 0;
220: }
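/* Sketch of how this path is typically reached (not from the original file): the numeric (I)LU
   factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ above) and the resulting factors are
   staged to the GPU for the triangular solves, e.g.

     ./app -mat_type seqaijcusparse -pc_type ilu

   unless -mat_cusparse_use_cpu_solve keeps the solves on the CPU. */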

222: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
223: {
224:   MatCUSPARSEStorageFormat format;
225:   PetscBool                flg;
226:   Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

228:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
229:   if (A->factortype == MAT_FACTOR_NONE) {
230:     PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg);
231:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format);

233:     PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg);
234:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format);
235:     PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg);
236:     if (flg) MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve);
237: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
238:     PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg);
239:     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
240:   #if CUSPARSE_VERSION > 11301
242:   #else
244:   #endif
245:     PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg);

248:     PetscCall(
249:       PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
251: #endif
252:   }
253:   PetscOptionsHeadEnd();
254:   return 0;
255: }
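/* Example runtime usage of the options registered above (a sketch; option names and values are those
   listed in this function and in the format/algorithm arrays near the top of the file):

     -mat_cusparse_storage_format hyb -mat_cusparse_use_cpu_solve
     -mat_cusparse_spmv_alg CSRMV_ALG2 -mat_cusparse_csr2csc_alg ALG1
*/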

257: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
258: {
259:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
260:   PetscInt                           n                  = A->rmap->n;
261:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
262:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
263:   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
264:   const MatScalar                   *aa = a->a, *v;
265:   PetscInt                          *AiLo, *AjLo;
266:   PetscInt                           i, nz, nzLower, offset, rowOffset;

268:   if (!n) return 0;
269:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
270:     try {
271:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
272:       nzLower = n + ai[n] - ai[1];
273:       if (!loTriFactor) {
274:         PetscScalar *AALo;

276:         cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar));

278:         /* Allocate Space for the lower triangular matrix */
279:         cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt));
280:         cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt));

282:         /* Fill the lower triangular matrix */
283:         AiLo[0]   = (PetscInt)0;
284:         AiLo[n]   = nzLower;
285:         AjLo[0]   = (PetscInt)0;
286:         AALo[0]   = (MatScalar)1.0;
287:         v         = aa;
288:         vi        = aj;
289:         offset    = 1;
290:         rowOffset = 1;
291:         for (i = 1; i < n; i++) {
292:           nz = ai[i + 1] - ai[i];
293:           /* additional 1 for the term on the diagonal */
294:           AiLo[i] = rowOffset;
295:           rowOffset += nz + 1;

297:           PetscArraycpy(&(AjLo[offset]), vi, nz);
298:           PetscArraycpy(&(AALo[offset]), v, nz);

300:           offset += nz;
301:           AjLo[offset] = (PetscInt)i;
302:           AALo[offset] = (MatScalar)1.0;
303:           offset += 1;

305:           v += nz;
306:           vi += nz;
307:         }

309:         /* allocate space for the triangular factor information */
310:         PetscNew(&loTriFactor);
311:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
312:         /* Create the matrix description */
313:         cusparseCreateMatDescr(&loTriFactor->descr);
314:         cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
315: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
316:         cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
317: #else
318:         cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
319: #endif
320:         cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);
321:         cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);

323:         /* set the operation */
324:         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

326:         /* set the matrix */
327:         loTriFactor->csrMat              = new CsrMatrix;
328:         loTriFactor->csrMat->num_rows    = n;
329:         loTriFactor->csrMat->num_cols    = n;
330:         loTriFactor->csrMat->num_entries = nzLower;

332:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
333:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

335:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
336:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

338:         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
339:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

341:         /* Create the solve analysis information */
342:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
343:         cusparseCreateCsrsvInfo(&loTriFactor->solveInfo);
344: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
345:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
347:         cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);
348: #endif

350:         /* perform the solve analysis */
351:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
352:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
353:         WaitForCUDA();
354:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

356:         /* assign the pointer */
357:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
358:         loTriFactor->AA_h                                          = AALo;
359:         cudaFreeHost(AiLo);
360:         cudaFreeHost(AjLo);
361:         PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar));
362:       } else { /* update values only */
363:         if (!loTriFactor->AA_h) cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar));
364:         /* Fill the lower triangular matrix */
365:         loTriFactor->AA_h[0] = 1.0;
366:         v                    = aa;
367:         vi                   = aj;
368:         offset               = 1;
369:         for (i = 1; i < n; i++) {
370:           nz = ai[i + 1] - ai[i];
371:           PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
372:           offset += nz;
373:           loTriFactor->AA_h[offset] = 1.0;
374:           offset += 1;
375:           v += nz;
376:         }
377:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
378:         PetscLogCpuToGpu(nzLower * sizeof(PetscScalar));
379:       }
380:     } catch (char *ex) {
381:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
382:     }
383:   }
384:   return 0;
385: }
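/* Worked illustration of the layout built above (not from the original file), assuming n = 3 and a
   factored matrix whose strict lower part has row 1 = {l10} and row 2 = {l21}, i.e. ai = {0, 0, 1, 2}:

     nzLower = n + ai[n] - ai[1] = 3 + 2 = 5
     AiLo = {0, 1, 3, 5}
     AjLo = {0,   0, 1,   1, 2}
     AALo = {1,   l10, 1,   l21, 1}

   Each row of L stores its strict lower entries followed by an explicit unit diagonal entry. */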

387: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
388: {
389:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
390:   PetscInt                           n                  = A->rmap->n;
391:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
392:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
393:   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
394:   const MatScalar                   *aa = a->a, *v;
395:   PetscInt                          *AiUp, *AjUp;
396:   PetscInt                           i, nz, nzUpper, offset;

398:   if (!n) return 0;
399:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
400:     try {
401:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
402:       nzUpper = adiag[0] - adiag[n];
403:       if (!upTriFactor) {
404:         PetscScalar *AAUp;

406:         cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar));

408:         /* Allocate Space for the upper triangular matrix */
409:         cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt));
410:         cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt));

412:         /* Fill the upper triangular matrix */
413:         AiUp[0] = (PetscInt)0;
414:         AiUp[n] = nzUpper;
415:         offset  = nzUpper;
416:         for (i = n - 1; i >= 0; i--) {
417:           v  = aa + adiag[i + 1] + 1;
418:           vi = aj + adiag[i + 1] + 1;

420:           /* number of elements NOT on the diagonal */
421:           nz = adiag[i] - adiag[i + 1] - 1;

423:           /* decrement the offset */
424:           offset -= (nz + 1);

426:           /* first, set the diagonal elements */
427:           AjUp[offset] = (PetscInt)i;
428:           AAUp[offset] = (MatScalar)1. / v[nz];
429:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

431:           PetscArraycpy(&(AjUp[offset + 1]), vi, nz);
432:           PetscArraycpy(&(AAUp[offset + 1]), v, nz);
433:         }

435:         /* allocate space for the triangular factor information */
436:         PetscNew(&upTriFactor);
437:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

439:         /* Create the matrix description */
440:         cusparseCreateMatDescr(&upTriFactor->descr);
441:         cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
442: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
443:         cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
444: #else
445:         cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
446: #endif
447:         cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
448:         cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);

450:         /* set the operation */
451:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

453:         /* set the matrix */
454:         upTriFactor->csrMat              = new CsrMatrix;
455:         upTriFactor->csrMat->num_rows    = n;
456:         upTriFactor->csrMat->num_cols    = n;
457:         upTriFactor->csrMat->num_entries = nzUpper;

459:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
460:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

462:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
463:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

465:         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
466:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

468:         /* Create the solve analysis information */
469:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
470:         cusparseCreateCsrsvInfo(&upTriFactor->solveInfo);
471: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
472:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
473:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
474:         cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);
475: #endif

477:         /* perform the solve analysis */
478:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
479:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

481:         WaitForCUDA();
482:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

484:         /* assign the pointer */
485:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
486:         upTriFactor->AA_h                                          = AAUp;
487:         cudaFreeHost(AiUp);
488:         cudaFreeHost(AjUp);
489:         PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar));
490:       } else {
491:         if (!upTriFactor->AA_h) cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar));
492:         /* Fill the upper triangular matrix */
493:         offset = nzUpper;
494:         for (i = n - 1; i >= 0; i--) {
495:           v = aa + adiag[i + 1] + 1;

497:           /* number of elements NOT on the diagonal */
498:           nz = adiag[i] - adiag[i + 1] - 1;

500:           /* decrement the offset */
501:           offset -= (nz + 1);

503:           /* first, set the diagonal elements */
504:           upTriFactor->AA_h[offset] = 1. / v[nz];
505:           PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz);
506:         }
507:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
508:         PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar));
509:       }
510:     } catch (char *ex) {
511:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
512:     }
513:   }
514:   return 0;
515: }

517: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
518: {
519:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
520:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
521:   IS                            isrow = a->row, iscol = a->icol;
522:   PetscBool                     row_identity, col_identity;
523:   PetscInt                      n = A->rmap->n;

526:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
527:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

529:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
530:   cusparseTriFactors->nnz = a->nz;

532:   A->offloadmask = PETSC_OFFLOAD_BOTH;
533:   /* lower triangular indices */
534:   ISIdentity(isrow, &row_identity);
535:   if (!row_identity && !cusparseTriFactors->rpermIndices) {
536:     const PetscInt *r;

538:     ISGetIndices(isrow, &r);
539:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
540:     cusparseTriFactors->rpermIndices->assign(r, r + n);
541:     ISRestoreIndices(isrow, &r);
542:     PetscLogCpuToGpu(n * sizeof(PetscInt));
543:   }

545:   /* upper triangular indices */
546:   ISIdentity(iscol, &col_identity);
547:   if (!col_identity && !cusparseTriFactors->cpermIndices) {
548:     const PetscInt *c;

550:     ISGetIndices(iscol, &c);
551:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
552:     cusparseTriFactors->cpermIndices->assign(c, c + n);
553:     ISRestoreIndices(iscol, &c);
554:     PetscLogCpuToGpu(n * sizeof(PetscInt));
555:   }
556:   return 0;
557: }

559: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
560: {
561:   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
562:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
563:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
564:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
565:   PetscInt                          *AiUp, *AjUp;
566:   PetscScalar                       *AAUp;
567:   PetscScalar                       *AALo;
568:   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
569:   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
570:   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
571:   const MatScalar                   *aa = b->a, *v;

573:   if (!n) return 0;
574:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
575:     try {
576:       cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar));
577:       cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar));
578:       if (!upTriFactor && !loTriFactor) {
579:         /* Allocate Space for the upper triangular matrix */
580:         cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt));
581:         cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt));

583:         /* Fill the upper triangular matrix */
584:         AiUp[0] = (PetscInt)0;
585:         AiUp[n] = nzUpper;
586:         offset  = 0;
587:         for (i = 0; i < n; i++) {
588:           /* set the pointers */
589:           v  = aa + ai[i];
590:           vj = aj + ai[i];
591:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

593:           /* first, set the diagonal elements */
594:           AjUp[offset] = (PetscInt)i;
595:           AAUp[offset] = (MatScalar)1.0 / v[nz];
596:           AiUp[i]      = offset;
597:           AALo[offset] = (MatScalar)1.0 / v[nz];

599:           offset += 1;
600:           if (nz > 0) {
601:             PetscArraycpy(&(AjUp[offset]), vj, nz);
602:             PetscArraycpy(&(AAUp[offset]), v, nz);
603:             for (j = offset; j < offset + nz; j++) {
604:               AAUp[j] = -AAUp[j];
605:               AALo[j] = AAUp[j] / v[nz];
606:             }
607:             offset += nz;
608:           }
609:         }

611:         /* allocate space for the triangular factor information */
612:         PetscNew(&upTriFactor);
613:         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

615:         /* Create the matrix description */
616:         cusparseCreateMatDescr(&upTriFactor->descr);
617:         cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
618: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
619:         cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
620: #else
621:         cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
622: #endif
623:         cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
624:         cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);

626:         /* set the matrix */
627:         upTriFactor->csrMat              = new CsrMatrix;
628:         upTriFactor->csrMat->num_rows    = A->rmap->n;
629:         upTriFactor->csrMat->num_cols    = A->cmap->n;
630:         upTriFactor->csrMat->num_entries = a->nz;

632:         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
633:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

635:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
636:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

638:         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
639:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

641:         /* set the operation */
642:         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

644:         /* Create the solve analysis information */
645:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
646:         cusparseCreateCsrsvInfo(&upTriFactor->solveInfo);
647: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
648:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
649:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
650:         cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);
651: #endif

653:         /* perform the solve analysis */
654:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
655:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

657:         WaitForCUDA();
658:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

660:         /* assign the pointer */
661:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

663:         /* allocate space for the triangular factor information */
664:         PetscNew(&loTriFactor);
665:         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

667:         /* Create the matrix description */
668:         cusparseCreateMatDescr(&loTriFactor->descr);
669:         cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
670: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
671:         cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
672: #else
673:         cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
674: #endif
675:         cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
676:         cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);

678:         /* set the operation */
679:         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

681:         /* set the matrix */
682:         loTriFactor->csrMat              = new CsrMatrix;
683:         loTriFactor->csrMat->num_rows    = A->rmap->n;
684:         loTriFactor->csrMat->num_cols    = A->cmap->n;
685:         loTriFactor->csrMat->num_entries = a->nz;

687:         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
688:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

690:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
691:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

693:         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
694:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

696:         /* Create the solve analysis information */
697:         PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
698:         cusparseCreateCsrsvInfo(&loTriFactor->solveInfo);
699: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
700:         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
701:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
702:         cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);
703: #endif

705:         /* perform the solve analysis */
706:         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
707:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

709:         WaitForCUDA();
710:         PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

712:         /* assign the pointer */
713:         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

715:         PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)));
716:         cudaFreeHost(AiUp);
717:         cudaFreeHost(AjUp);
718:       } else {
719:         /* Fill the upper triangular matrix */
720:         offset = 0;
721:         for (i = 0; i < n; i++) {
722:           /* set the pointers */
723:           v  = aa + ai[i];
724:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

726:           /* first, set the diagonal elements */
727:           AAUp[offset] = 1.0 / v[nz];
728:           AALo[offset] = 1.0 / v[nz];

730:           offset += 1;
731:           if (nz > 0) {
732:             PetscArraycpy(&(AAUp[offset]), v, nz);
733:             for (j = offset; j < offset + nz; j++) {
734:               AAUp[j] = -AAUp[j];
735:               AALo[j] = AAUp[j] / v[nz];
736:             }
737:             offset += nz;
738:           }
739:         }
742:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
743:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
744:         PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar));
745:       }
746:       cudaFreeHost(AAUp);
747:       cudaFreeHost(AALo);
748:     } catch (char *ex) {
749:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
750:     }
751:   }
752:   return 0;
753: }

755: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
756: {
757:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
758:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
759:   IS                            ip                 = a->row;
760:   PetscBool                     perm_identity;
761:   PetscInt                      n = A->rmap->n;

764:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
765:   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
766:   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

768:   A->offloadmask = PETSC_OFFLOAD_BOTH;

770:   /* lower triangular indices */
771:   ISIdentity(ip, &perm_identity);
772:   if (!perm_identity) {
773:     IS              iip;
774:     const PetscInt *irip, *rip;

776:     ISInvertPermutation(ip, PETSC_DECIDE, &iip);
777:     ISGetIndices(iip, &irip);
778:     ISGetIndices(ip, &rip);
779:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
780:     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
781:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
782:     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
783:     ISRestoreIndices(iip, &irip);
784:     ISDestroy(&iip);
785:     ISRestoreIndices(ip, &rip);
786:     PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
787:   }
788:   return 0;
789: }

791: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
792: {
793:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
794:   IS          ip = b->row;
795:   PetscBool   perm_identity;

797:   MatSeqAIJCUSPARSECopyFromGPU(A);
798:   MatCholeskyFactorNumeric_SeqAIJ(B, A, info);
799:   B->offloadmask = PETSC_OFFLOAD_CPU;
800:   /* determine which version of MatSolve needs to be used. */
801:   ISIdentity(ip, &perm_identity);
802:   if (perm_identity) {
803:     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
804:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
805:     B->ops->matsolve          = NULL;
806:     B->ops->matsolvetranspose = NULL;
807:   } else {
808:     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
809:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
810:     B->ops->matsolve          = NULL;
811:     B->ops->matsolvetranspose = NULL;
812:   }

814:   /* get the triangular factors */
815:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
816:   return 0;
817: }

819: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
820: {
821:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
822:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
823:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
824:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
825:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
826:   cusparseIndexBase_t                indexBase;
827:   cusparseMatrixType_t               matrixType;
828:   cusparseFillMode_t                 fillMode;
829:   cusparseDiagType_t                 diagType;

831:   /* allocate space for the transpose of the lower triangular factor */
832:   PetscNew(&loTriFactorT);
833:   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

835:   /* set the matrix descriptors of the lower triangular factor */
836:   matrixType = cusparseGetMatType(loTriFactor->descr);
837:   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
838:   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
839:   diagType   = cusparseGetMatDiagType(loTriFactor->descr);

841:   /* Create the matrix description */
842:   cusparseCreateMatDescr(&loTriFactorT->descr);
843:   cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);
844:   cusparseSetMatType(loTriFactorT->descr, matrixType);
845:   cusparseSetMatFillMode(loTriFactorT->descr, fillMode);
846:   cusparseSetMatDiagType(loTriFactorT->descr, diagType);

848:   /* set the operation */
849:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

851:   /* allocate GPU space for the CSC of the lower triangular factor*/
852:   loTriFactorT->csrMat                 = new CsrMatrix;
853:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
854:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
855:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
856:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
857:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
858:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

860:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
861: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
862:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
863:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
864:                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
865:   cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize);
866: #endif

868:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
869:   {
870:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
871:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872:                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
873: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
874:                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
875: #else
876:                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
877: #endif
878:     PetscCallCUSPARSE(stat);
879:   }

881:   WaitForCUDA();
882:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);

884:   /* Create the solve analysis information */
885:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
886:   cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo);
887: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
888:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
889:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
890:   cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize);
891: #endif

893:   /* perform the solve analysis */
894:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
895:                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

897:   WaitForCUDA();
898:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

900:   /* assign the pointer */
901:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

903:   /*********************************************/
904:   /* Now the Transpose of the Upper Tri Factor */
905:   /*********************************************/

907:   /* allocate space for the transpose of the upper triangular factor */
908:   PetscNew(&upTriFactorT);
909:   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

911:   /* set the matrix descriptors of the upper triangular factor */
912:   matrixType = cusparseGetMatType(upTriFactor->descr);
913:   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
914:   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
915:   diagType   = cusparseGetMatDiagType(upTriFactor->descr);

917:   /* Create the matrix description */
918:   cusparseCreateMatDescr(&upTriFactorT->descr);
919:   cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);
920:   cusparseSetMatType(upTriFactorT->descr, matrixType);
921:   cusparseSetMatFillMode(upTriFactorT->descr, fillMode);
922:   cusparseSetMatDiagType(upTriFactorT->descr, diagType);

924:   /* set the operation */
925:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

927:   /* allocate GPU space for the CSC of the upper triangular factor*/
928:   upTriFactorT->csrMat                 = new CsrMatrix;
929:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
930:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
931:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
932:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
933:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
934:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

936:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
937: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
938:   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
939:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
940:                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
941:   cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize);
942: #endif

944:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
945:   {
946:     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
947:     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
948:                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
949: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
950:                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
951: #else
952:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
953: #endif
954:     PetscCallCUSPARSE(stat);
955:   }

957:   WaitForCUDA();
958:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);

960:   /* Create the solve analysis information */
961:   PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
962:   cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo);
963: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
964:   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
965:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
966:   cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize);
967: #endif

969:   /* perform the solve analysis */
970:   /* christ, would it have killed you to put this stuff in a function????????? */
971:   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
972:                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

974:   WaitForCUDA();
975:   PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);

977:   /* assign the pointer */
978:   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
979:   return 0;
980: }

982: struct PetscScalarToPetscInt {
983:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
984: };

986: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
987: {
988:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
989:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
990:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
991:   cusparseStatus_t              stat;
992:   cusparseIndexBase_t           indexBase;

994:   MatSeqAIJCUSPARSECopyToGPU(A);
995:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
997:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
999:   if (A->transupdated) return 0;
1000:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
1001:   PetscLogGpuTimeBegin();
1002:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
1003:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1004:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1005:     cusparseCreateMatDescr(&matstructT->descr);
1006:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1007:     cusparseSetMatIndexBase(matstructT->descr, indexBase);
1008:     cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);

1010:     /* set alpha and beta */
1011:     cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));
1012:     cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar));
1013:     cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
1014:     cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
1015:     cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
1016:     cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);

1018:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1019:       CsrMatrix *matrixT      = new CsrMatrix;
1020:       matstructT->mat         = matrixT;
1021:       matrixT->num_rows       = A->cmap->n;
1022:       matrixT->num_cols       = A->rmap->n;
1023:       matrixT->num_entries    = a->nz;
1024:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1025:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1026:       matrixT->values         = new THRUSTARRAY(a->nz);

1028:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1029:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

1031: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1032:   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1033:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1034:                                indexBase, cusparse_scalartype);
1035:       PetscCallCUSPARSE(stat);
1036:   #else
1037:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1038:            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

1040:            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1041:            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1042:            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1043:         */
1044:       if (matrixT->num_entries) {
1045:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1046:         PetscCallCUSPARSE(stat);

1048:       } else {
1049:         matstructT->matDescr = NULL;
1050:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1051:       }
1052:   #endif
1053: #endif
1054:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1055: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1056:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1057: #else
1058:       CsrMatrix *temp = new CsrMatrix;
1059:       CsrMatrix *tempT = new CsrMatrix;
1060:       /* First convert HYB to CSR */
1061:       temp->num_rows = A->rmap->n;
1062:       temp->num_cols = A->cmap->n;
1063:       temp->num_entries = a->nz;
1064:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1065:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1066:       temp->values = new THRUSTARRAY(a->nz);

1068:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1069:       PetscCallCUSPARSE(stat);

1071:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1072:       tempT->num_rows = A->rmap->n;
1073:       tempT->num_cols = A->cmap->n;
1074:       tempT->num_entries = a->nz;
1075:       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1076:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1077:       tempT->values = new THRUSTARRAY(a->nz);

1079:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1080:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1081:       PetscCallCUSPARSE(stat);

1083:       /* Last, convert CSC to HYB */
1084:       cusparseHybMat_t hybMat;
1085:       cusparseCreateHybMat(&hybMat);
1086:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1087:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1088:       PetscCallCUSPARSE(stat);

1090:       /* assign the pointer */
1091:       matstructT->mat = hybMat;
1092:       A->transupdated = PETSC_TRUE;
1093:       /* delete temporaries */
1094:       if (tempT) {
1095:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1096:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1097:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1098:         delete (CsrMatrix *)tempT;
1099:       }
1100:       if (temp) {
1101:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1102:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1103:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1104:         delete (CsrMatrix *)temp;
1105:       }
1106: #endif
1107:     }
1108:   }
1109:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1110:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1111:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1120:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1121:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1122:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1123:       PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
1124:     }
1125:     if (!cusparsestruct->csr2csc_i) {
1126:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1127:       thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0);

1129:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1130: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1131:       void  *csr2cscBuffer;
1132:       size_t csr2cscBufferSize;
1133:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1134:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1135:       PetscCallCUSPARSE(stat);
1136:       cudaMalloc(&csr2cscBuffer, csr2cscBufferSize);
1137: #endif

1139:       if (matrix->num_entries) {
1140:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1141:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1142:            I checked every parameter and they were all fine; I have no clue why cusparse complains.

1144:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1145:            should be filled with indexBase. So I just take a shortcut here.
1146:         */
1147:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1148: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1149:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1150:         PetscCallCUSPARSE(stat);
1151: #else
1152:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1153:         PetscCallCUSPARSE(stat);
1154: #endif
1155:       } else {
1156:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1157:       }

1159:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1160:       thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt());
1161: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1162:       cudaFree(csr2cscBuffer);
1163: #endif
1164:     }
1165:     PetscCallThrust(
1166:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
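    /* Illustration of the csr2csc_i mapping used above (example values, not from the code): for the 2x3 matrix
         A = [a00 0 a02; 0 a11 0], the CSR values are [a00, a02, a11], while the transpose stores its entries in
       the order [a00, a11, a02], so csr2csc_i = [0, 2, 1]. The permutation-gather above then refreshes
       matrixT->values[k] as matrix->values[csr2csc_i[k]] without rerunning csr2csc. */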
1167:   }
1168:   PetscLogGpuTimeEnd();
1169:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
1170:   /* the compressed row indices are not used for matTranspose */
1171:   matstructT->cprowIndices = NULL;
1172:   /* assign the pointer */
1173:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1174:   A->transupdated                                = PETSC_TRUE;
1175:   return 0;
1176: }

1178: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1179: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1180: {
1181:   PetscInt                              n = xx->map->n;
1182:   const PetscScalar                    *barray;
1183:   PetscScalar                          *xarray;
1184:   thrust::device_ptr<const PetscScalar> bGPU;
1185:   thrust::device_ptr<PetscScalar>       xGPU;
1186:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1187:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1188:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1189:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1191:   /* Analyze the matrix and create the transpose ... on the fly */
1192:   if (!loTriFactorT && !upTriFactorT) {
1193:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1194:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1195:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1196:   }

1198:   /* Get the GPU pointers */
1199:   VecCUDAGetArrayWrite(xx, &xarray);
1200:   VecCUDAGetArrayRead(bb, &barray);
1201:   xGPU = thrust::device_pointer_cast(xarray);
1202:   bGPU = thrust::device_pointer_cast(barray);

1204:   PetscLogGpuTimeBegin();
1205:   /* First, reorder with the row permutation */
1206:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

1208:   /* First, solve U */
1209:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1210:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1212:   /* Then, solve L */
1213:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1214:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
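  /* Note on the order above: since (LU)^T = U^T L^T, the transposed system is solved with the transpose of U
     first and then the transpose of L, i.e., the reverse of the forward-solve order. */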

1216:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1217:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

1219:   /* Copy the temporary to the full solution. */
1220:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

1222:   /* restore */
1223:   VecCUDARestoreArrayRead(bb, &barray);
1224:   VecCUDARestoreArrayWrite(xx, &xarray);
1225:   PetscLogGpuTimeEnd();
1226:   PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1227:   return 0;
1228: }

1230: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1231: {
1232:   const PetscScalar                 *barray;
1233:   PetscScalar                       *xarray;
1234:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1235:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1236:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1237:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1239:   /* Analyze the matrix and create the transpose ... on the fly */
1240:   if (!loTriFactorT && !upTriFactorT) {
1241:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1242:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1243:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1244:   }

1246:   /* Get the GPU pointers */
1247:   VecCUDAGetArrayWrite(xx, &xarray);
1248:   VecCUDAGetArrayRead(bb, &barray);

1250:   PetscLogGpuTimeBegin();
1251:   /* First, solve U */
1252:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1253:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1255:   /* Then, solve L */
1256:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1257:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1259:   /* restore */
1260:   VecCUDARestoreArrayRead(bb, &barray);
1261:   VecCUDARestoreArrayWrite(xx, &xarray);
1262:   PetscLogGpuTimeEnd();
1263:   PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1264:   return 0;
1265: }

1267: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1268: {
1269:   const PetscScalar                    *barray;
1270:   PetscScalar                          *xarray;
1271:   thrust::device_ptr<const PetscScalar> bGPU;
1272:   thrust::device_ptr<PetscScalar>       xGPU;
1273:   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1274:   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1275:   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1276:   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1278:   /* Get the GPU pointers */
1279:   VecCUDAGetArrayWrite(xx, &xarray);
1280:   VecCUDAGetArrayRead(bb, &barray);
1281:   xGPU = thrust::device_pointer_cast(xarray);
1282:   bGPU = thrust::device_pointer_cast(barray);

1284:   PetscLogGpuTimeBegin();
1285:   /* First, reorder with the row permutation */
1286:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

1288:   /* Next, solve L */
1289:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1290:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1292:   /* Then, solve U */
1293:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1294:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1296:   /* Last, reorder with the column permutation */
1297:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

1299:   VecCUDARestoreArrayRead(bb, &barray);
1300:   VecCUDARestoreArrayWrite(xx, &xarray);
1301:   PetscLogGpuTimeEnd();
1302:   PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1303:   return 0;
1304: }

1306: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1307: {
1308:   const PetscScalar                 *barray;
1309:   PetscScalar                       *xarray;
1310:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1311:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1312:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1313:   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

1315:   /* Get the GPU pointers */
1316:   VecCUDAGetArrayWrite(xx, &xarray);
1317:   VecCUDAGetArrayRead(bb, &barray);

1319:   PetscLogGpuTimeBegin();
1320:   /* First, solve L */
1321:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1322:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1324:   /* Next, solve U */
1325:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1326:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1328:   VecCUDARestoreArrayRead(bb, &barray);
1329:   VecCUDARestoreArrayWrite(xx, &xarray);
1330:   PetscLogGpuTimeEnd();
1331:   PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1332:   return 0;
1333: }

1335: #if CUSPARSE_VERSION >= 11500
1336: /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1337: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1338: {
1339:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1340:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1341:   const PetscScalar            *barray;
1342:   PetscScalar                  *xarray;

1344:   VecCUDAGetArrayWrite(x, &xarray);
1345:   VecCUDAGetArrayRead(b, &barray);
1346:   PetscLogGpuTimeBegin();

1348:   /* Solve L*y = b */
1349:   cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1350:   cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1351:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1352:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
1353:                                        fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer passed to cusparseSpSV_analysis()!

1355:   /* Solve U*x = y */
1356:   cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1357:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1358:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

1360:   VecCUDARestoreArrayRead(b, &barray);
1361:   VecCUDARestoreArrayWrite(x, &xarray);

1363:   PetscLogGpuTimeEnd();
1364:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1365:   return 0;
1366: }

1368: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1369: {
1370:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1371:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1372:   const PetscScalar            *barray;
1373:   PetscScalar                  *xarray;

1375:   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1376:     cusparseSpSV_createDescr(&fs->spsvDescr_Lt);
1377:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1378:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1380:     cusparseSpSV_createDescr(&fs->spsvDescr_Ut);
1381:     cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut);
1382:     cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1383:     cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut);
1384:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1385:   }

1387:   if (!fs->updatedTransposeSpSVAnalysis) {
1388:     cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);

1390:     cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut);
1391:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1392:   }

1394:   VecCUDAGetArrayWrite(x, &xarray);
1395:   VecCUDAGetArrayRead(b, &barray);
1396:   PetscLogGpuTimeBegin();

1398:   /* Solve Ut*y = b */
1399:   cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1400:   cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1401:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1402:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

1404:   /* Solve Lt*x = y */
1405:   cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1406:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1407:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1409:   VecCUDARestoreArrayRead(b, &barray);
1410:   VecCUDARestoreArrayWrite(x, &xarray);
1411:   PetscLogGpuTimeEnd();
1412:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1413:   return 0;
1414: }

1416: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1417: {
1418:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1419:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1420:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1421:   CsrMatrix                    *Acsr;
1422:   PetscInt                      m, nz;
1423:   PetscBool                     flg;

1425:   if (PetscDefined(USE_DEBUG)) {
1426:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1427:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1428:   }

1430:   /* Copy A's value to fact */
1431:   m  = fact->rmap->n;
1432:   nz = aij->nz;
1433:   MatSeqAIJCUSPARSECopyToGPU(A);
1434:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1435:   cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);

1437:   /* Factorize fact inplace */
1438:   if (m)
1439:     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1440:                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1441:   if (PetscDefined(USE_DEBUG)) {
1442:     int              numerical_zero;
1443:     cusparseStatus_t status;
1444:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1445:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1446:   }

1448:   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1449:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1450:   */
1451:   cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);

1453:   cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U);

1455:   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1456:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1458:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1459:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1460:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1461:   fact->ops->matsolve          = NULL;
1462:   fact->ops->matsolvetranspose = NULL;
1463:   PetscLogGpuFlops(fs->numericFactFlops);
1464:   return 0;
1465: }

1467: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1468: {
1469:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1470:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1471:   PetscInt                      m, nz;

1473:   if (PetscDefined(USE_DEBUG)) {
1474:     PetscInt  i;
1475:     PetscBool flg, missing;

1477:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1478:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1480:     MatMissingDiagonal(A, &missing, &i);
1481:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1482:   }

1484:   /* Free the old stale stuff */
1485:   MatSeqAIJCUSPARSETriFactors_Reset(&fs);

1487:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1488:      but they will not be used; we allocate them just for easy debugging.
1489:    */
1490:   MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);

1492:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1493:   fact->factortype             = MAT_FACTOR_ILU;
1494:   fact->info.factor_mallocs    = 0;
1495:   fact->info.fill_ratio_given  = info->fill;
1496:   fact->info.fill_ratio_needed = 1.0;

1498:   aij->row = NULL;
1499:   aij->col = NULL;

1501:   /* ====================================================================== */
1502:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1503:   /* We'll do in-place factorization on fact                                */
1504:   /* ====================================================================== */
1505:   const int *Ai, *Aj;

1507:   m  = fact->rmap->n;
1508:   nz = aij->nz;

1510:   cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1511:   cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1512:   cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1513:   MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1514:   cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1515:   cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);

1517:   /* ====================================================================== */
1518:   /* Create descriptors for M, L, U                                         */
1519:   /* ====================================================================== */
1520:   cusparseFillMode_t fillMode;
1521:   cusparseDiagType_t diagType;

1523:   cusparseCreateMatDescr(&fs->matDescr_M);
1524:   cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO);
1525:   cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL);

1527:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1528:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1529:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1530:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1531:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1532:   */
1533:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1534:   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1535:   cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1536:   cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1537:   cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1539:   fillMode = CUSPARSE_FILL_MODE_UPPER;
1540:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1541:   cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1542:   cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1543:   cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1545:   /* ========================================================================= */
1546:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1547:   /* ========================================================================= */
1548:   cusparseCreateCsrilu02Info(&fs->ilu0Info_M);
1549:   if (m)
1550:     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1551:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1553:   cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1554:   cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);

1556:   cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype);
1557:   cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype);

1559:   cusparseSpSV_createDescr(&fs->spsvDescr_L);
1560:   cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);

1562:   cusparseSpSV_createDescr(&fs->spsvDescr_U);
1563:   cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U);

1565:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1566:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1567:      spsvBuffer_L and spsvBuffer_U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
1568:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1569:    */
1570:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1571:     cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1572:     fs->spsvBuffer_L = fs->factBuffer_M;
1573:     cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U);
1574:   } else {
1575:     cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M));
1576:     fs->spsvBuffer_U = fs->factBuffer_M;
1577:     cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1578:   }
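  /* Numerical illustration of the sharing above (hypothetical sizes): with spsvBufferSize_L = 8 MB,
     spsvBufferSize_U = 6 MB and factBufferSize_M = 10 MB, the first branch allocates max(8, 10) = 10 MB for
     factBuffer_M, points spsvBuffer_L at it, and allocates a separate 6 MB buffer for spsvBuffer_U. */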

1580:   /* ========================================================================== */
1581:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1582:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1583:   /* ========================================================================== */
1584:   int              structural_zero;
1585:   cusparseStatus_t status;

1587:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1588:   if (m)
1589:     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1590:                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1591:   if (PetscDefined(USE_DEBUG)) {
1592:     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1593:     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1594:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1595:   }

1597:   /* Estimate FLOPs of the numeric factorization */
1598:   {
1599:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1600:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1601:     PetscLogDouble flops = 0.0;

1603:     MatMarkDiagonal_SeqAIJ(A);
1604:     Ai    = Aseq->i;
1605:     Adiag = Aseq->diag;
1606:     for (PetscInt i = 0; i < m; i++) {
1607:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1608:         nzRow  = Ai[i + 1] - Ai[i];
1609:         nzLeft = Adiag[i] - Ai[i];
1610:         /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the nonzeros to
1611:           the right of, and including, the eliminated one, which incurs one multiplication and one addition per entry.
1612:         */
1613:         nzLeft = (nzRow - 1) / 2;
1614:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1615:       }
1616:     }
1617:     fs->numericFactFlops = flops;
1618:   }
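  /* Example of the estimate above: a row with nzRow = 5 entries gives nzLeft = (5 - 1) / 2 = 2 and
     contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops. */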
1619:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1620:   return 0;
1621: }

1623: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1624: {
1625:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1626:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1627:   const PetscScalar            *barray;
1628:   PetscScalar                  *xarray;

1630:   VecCUDAGetArrayWrite(x, &xarray);
1631:   VecCUDAGetArrayRead(b, &barray);
1632:   PetscLogGpuTimeBegin();

1634:   /* Solve L*y = b */
1635:   cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1636:   cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1637:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1638:                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

1640:   /* Solve Lt*x = y */
1641:   cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1642:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1643:                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

1645:   VecCUDARestoreArrayRead(b, &barray);
1646:   VecCUDARestoreArrayWrite(x, &xarray);

1648:   PetscLogGpuTimeEnd();
1649:   PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1650:   return 0;
1651: }

1653: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1654: {
1655:   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1656:   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1657:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1658:   CsrMatrix                    *Acsr;
1659:   PetscInt                      m, nz;
1660:   PetscBool                     flg;

1662:   if (PetscDefined(USE_DEBUG)) {
1663:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1664:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1665:   }

1667:   /* Copy A's value to fact */
1668:   m  = fact->rmap->n;
1669:   nz = aij->nz;
1670:   MatSeqAIJCUSPARSECopyToGPU(A);
1671:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1672:   cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);

1674:   /* Factorize fact inplace */
1675:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1676:      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1677:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1678:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1679:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1680:    */
1681:   if (m) cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1682:   if (PetscDefined(USE_DEBUG)) {
1683:     int              numerical_zero;
1684:     cusparseStatus_t status;
1685:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1686:     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1687:   }

1689:   cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);

1691:   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1692:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1693:   */
1694:   cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);

1696:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1697:   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1698:   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1699:   fact->ops->matsolve          = NULL;
1700:   fact->ops->matsolvetranspose = NULL;
1701:   PetscLogGpuFlops(fs->numericFactFlops);
1702:   return 0;
1703: }

1705: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1706: {
1707:   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1708:   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1709:   PetscInt                      m, nz;

1711:   if (PetscDefined(USE_DEBUG)) {
1712:     PetscInt  i;
1713:     PetscBool flg, missing;

1715:     PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1716:     PetscCheck(flg, PETSC_COMM_SELF, PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1718:     MatMissingDiagonal(A, &missing, &i);
1719:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1720:   }

1722:   /* Free the old stale stuff */
1723:   MatSeqAIJCUSPARSETriFactors_Reset(&fs);

1725:   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
1726:      but they will not be used; we allocate them just for easy debugging.
1727:    */
1728:   MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);

1730:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1731:   fact->factortype             = MAT_FACTOR_ICC;
1732:   fact->info.factor_mallocs    = 0;
1733:   fact->info.fill_ratio_given  = info->fill;
1734:   fact->info.fill_ratio_needed = 1.0;

1736:   aij->row = NULL;
1737:   aij->col = NULL;

1739:   /* ====================================================================== */
1740:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1741:   /* We'll do in-place factorization on fact                                */
1742:   /* ====================================================================== */
1743:   const int *Ai, *Aj;

1745:   m  = fact->rmap->n;
1746:   nz = aij->nz;

1748:   cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1749:   cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1750:   cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1751:   MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1752:   cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1753:   cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);

1755:   /* ====================================================================== */
1756:   /* Create mat descriptors for M, L                                        */
1757:   /* ====================================================================== */
1758:   cusparseFillMode_t fillMode;
1759:   cusparseDiagType_t diagType;

1761:   cusparseCreateMatDescr(&fs->matDescr_M);
1762:   cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO);
1763:   cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL);

1765:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1766:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1767:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1768:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1769:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1770:   */
1771:   fillMode = CUSPARSE_FILL_MODE_LOWER;
1772:   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1773:   cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1774:   cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1775:   cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));

1777:   /* ========================================================================= */
1778:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1779:   /* ========================================================================= */
1780:   cusparseCreateCsric02Info(&fs->ic0Info_M);
1781:   if (m) cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M);

1783:   cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1784:   cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);

1786:   cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype);
1787:   cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype);

1789:   cusparseSpSV_createDescr(&fs->spsvDescr_L);
1790:   cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);

1792:   cusparseSpSV_createDescr(&fs->spsvDescr_Lt);
1793:   cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt);

1795:   /* To save device memory, we make the factorization buffer share with one of the solver buffers.
1796:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1797:    */
1798:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1799:     cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1800:     fs->spsvBuffer_L = fs->factBuffer_M;
1801:     cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1802:   } else {
1803:     cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M));
1804:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1805:     cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1806:   }

1808:   /* ========================================================================== */
1809:   /* Perform analysis of ic0 on M                                               */
1810:   /* The lower triangular part of M has the same sparsity pattern as L          */
1811:   /* ========================================================================== */
1812:   int              structural_zero;
1813:   cusparseStatus_t status;

1815:   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1816:   if (m) cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1817:   if (PetscDefined(USE_DEBUG)) {
1818:     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1819:     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1820:     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1821:   }

1823:   /* Estimate FLOPs of the numeric factorization */
1824:   {
1825:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1826:     PetscInt      *Ai, nzRow, nzLeft;
1827:     PetscLogDouble flops = 0.0;

1829:     Ai = Aseq->i;
1830:     for (PetscInt i = 0; i < m; i++) {
1831:       nzRow = Ai[i + 1] - Ai[i];
1832:       if (nzRow > 1) {
1833:         /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the nonzeros to
1834:           the right of, and including, the eliminated one, which incurs one multiplication and one addition per entry.
1835:         */
1836:         nzLeft = (nzRow - 1) / 2;
1837:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1838:       }
1839:     }
1840:     fs->numericFactFlops = flops;
1841:   }
1842:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1843:   return 0;
1844: }
1845: #endif

1847: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1848: {
1849:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1851: #if CUSPARSE_VERSION >= 11500
1852:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1853:   if (cusparseTriFactors->factorizeOnDevice) {
1854:     ISIdentity(isrow, &row_identity);
1855:     ISIdentity(iscol, &col_identity);
1856:   }
1857:   if (!info->levels && row_identity && col_identity) {
1858:     MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info);
1859:   } else
1860: #endif
1861:   {
1862:     MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1863:     MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1864:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1865:   }
1866:   return 0;
1867: }

1869: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1870: {
1871:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1873:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1874:   MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1875:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1876:   return 0;
1877: }

1879: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1880: {
1881:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1883: #if CUSPARSE_VERSION >= 11500
1884:   PetscBool perm_identity = PETSC_FALSE;
1885:   if (cusparseTriFactors->factorizeOnDevice) ISIdentity(perm, &perm_identity);
1886:   if (!info->levels && perm_identity) {
1887:     MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info);
1888:   } else
1889: #endif
1890:   {
1891:     MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1892:     MatICCFactorSymbolic_SeqAIJ(B, A, perm, info);
1893:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1894:   }
1895:   return 0;
1896: }

1898: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1899: {
1900:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

1902:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1903:   MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info);
1904:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1905:   return 0;
1906: }

1908: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
1909: {
1910:   *type = MATSOLVERCUSPARSE;
1911:   return 0;
1912: }

1914: /*MC
1915:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
1916:   of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
1917:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1918:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1919:   cuSPARSE triangular solve algorithm, but the performance can be quite poor, so these
1920:   algorithms are not recommended. This class does NOT support direct solver operations.

1922:   Level: beginner

1924: .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1925: M*/
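/* Typical usage (sketch, using standard PETSc options): select this solver from the options database for an
   ILU-preconditioned run with
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   and optionally add -mat_factor_bind_factorization host (registered below) to perform the factorization on the host. */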

1927: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1928: {
1929:   PetscInt  n = A->rmap->n;
1930:   PetscBool factOnDevice, factOnHost;
1931:   char     *prefix;
1932:   char      factPlace[32] = "device"; /* the default */

1934:   MatCreate(PetscObjectComm((PetscObject)A), B);
1935:   MatSetSizes(*B, n, n, n, n);
1936:   (*B)->factortype = ftype;
1937:   MatSetType(*B, MATSEQAIJCUSPARSE);

1939:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1940:   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1941:   PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL);
1942:   PetscOptionsEnd();
1943:   PetscStrcasecmp("device", factPlace, &factOnDevice);
1944:   PetscStrcasecmp("host", factPlace, &factOnHost);
1945:   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option value %s for -mat_factor_bind_factorization. Only host and device are allowed", factPlace);
1946:   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

1948:   if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B, PETSC_TRUE);
1949:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1950:     MatSetBlockSizesFromMats(*B, A, A);
1951:     if (!A->boundtocpu) {
1952:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1953:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1954:     } else {
1955:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1956:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1957:     }
1958:     PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1959:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
1960:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
1961:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1962:     if (!A->boundtocpu) {
1963:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1964:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1965:     } else {
1966:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1967:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1968:     }
1969:     PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
1970:     PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
1971:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

1973:   MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1974:   (*B)->canuseordering = PETSC_TRUE;
1975:   PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse);
1976:   return 0;
1977: }

1979: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1980: {
1981:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
1982:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1983: #if CUSPARSE_VERSION >= 13500
1984:   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1985: #endif

1987:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1988:     PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0);
1989:     if (A->factortype == MAT_FACTOR_NONE) {
1990:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1991:       cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1992:     }
1993: #if CUSPARSE_VERSION >= 13500
1994:     else if (fs->csrVal) {
1995:       /* We have a factorized matrix on device and are able to copy it to host */
1996:       cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1997:     }
1998: #endif
1999:     else
2000:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2001:     PetscLogGpuToCpu(a->nz * sizeof(PetscScalar));
2002:     PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0);
2003:     A->offloadmask = PETSC_OFFLOAD_BOTH;
2004:   }
2005:   return 0;
2006: }

2008: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2009: {
2010:   MatSeqAIJCUSPARSECopyFromGPU(A);
2011:   *array = ((Mat_SeqAIJ *)A->data)->a;
2012:   return 0;
2013: }

2015: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2016: {
2017:   A->offloadmask = PETSC_OFFLOAD_CPU;
2018:   *array         = NULL;
2019:   return 0;
2020: }

2022: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2023: {
2024:   MatSeqAIJCUSPARSECopyFromGPU(A);
2025:   *array = ((Mat_SeqAIJ *)A->data)->a;
2026:   return 0;
2027: }

2029: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2030: {
2031:   *array = NULL;
2032:   return 0;
2033: }

2035: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2036: {
2037:   *array = ((Mat_SeqAIJ *)A->data)->a;
2038:   return 0;
2039: }

2041: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2042: {
2043:   A->offloadmask = PETSC_OFFLOAD_CPU;
2044:   *array         = NULL;
2045:   return 0;
2046: }
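/* Summary of the host-array accessors above: the plain Get/Restore pair copies values device-to-host when needed
   and marks the host copy as authoritative on restore (PETSC_OFFLOAD_CPU); the Read pair copies but leaves the
   offload mask untouched; the Write pair skips the copy (the contents will be overwritten) and marks the host
   copy authoritative on restore. */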

2048: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2049: {
2050:   Mat_SeqAIJCUSPARSE *cusp;
2051:   CsrMatrix          *matrix;

2053:   MatSeqAIJCUSPARSECopyToGPU(A);
2055:   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2057:   matrix = (CsrMatrix *)cusp->mat->mat;

2059:   if (i) {
2060: #if !defined(PETSC_USE_64BIT_INDICES)
2061:     *i = matrix->row_offsets->data().get();
2062: #else
2063:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2064: #endif
2065:   }
2066:   if (j) {
2067: #if !defined(PETSC_USE_64BIT_INDICES)
2068:     *j = matrix->column_indices->data().get();
2069: #else
2070:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2071: #endif
2072:   }
2073:   if (a) *a = matrix->values->data().get();
2074:   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2075:   return 0;
2076: }

2078: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2079: {
2080:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2081:   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
2082:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2083:   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2084:   cusparseStatus_t              stat;
2085:   PetscBool                     both = PETSC_TRUE;

2088:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2089:     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2090:       CsrMatrix *matrix;
2091:       matrix = (CsrMatrix *)cusparsestruct->mat->mat;

2094:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2095:       matrix->values->assign(a->a, a->a + a->nz);
2096:       WaitForCUDA();
2097:       PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar));
2098:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2099:       MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
2100:     } else {
2101:       PetscInt nnz;
2102:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2103:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format);
2104:       MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
2105:       delete cusparsestruct->workVector;
2106:       delete cusparsestruct->rowoffsets_gpu;
2107:       cusparsestruct->workVector     = NULL;
2108:       cusparsestruct->rowoffsets_gpu = NULL;
2109:       try {
2110:         if (a->compressedrow.use) {
2111:           m    = a->compressedrow.nrows;
2112:           ii   = a->compressedrow.i;
2113:           ridx = a->compressedrow.rindex;
2114:         } else {
2115:           m    = A->rmap->n;
2116:           ii   = a->i;
2117:           ridx = NULL;
2118:         }
2120:         if (!a->a) {
2121:           nnz  = ii[m];
2122:           both = PETSC_FALSE;
2123:         } else nnz = a->nz;

2126:         /* create cusparse matrix */
2127:         cusparsestruct->nrows = m;
2128:         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
2129:         cusparseCreateMatDescr(&matstruct->descr);
2130:         cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);
2131:         cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);

2133:         cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));
2134:         cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar));
2135:         cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
2136:         cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2137:         cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2138:         cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2139:         cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);

2141:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2142:         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2143:           /* set the matrix */
2144:           CsrMatrix *mat   = new CsrMatrix;
2145:           mat->num_rows    = m;
2146:           mat->num_cols    = A->cmap->n;
2147:           mat->num_entries = nnz;
2148:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2149:           mat->row_offsets->assign(ii, ii + m + 1);

2151:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2152:           mat->column_indices->assign(a->j, a->j + nnz);

2154:           mat->values = new THRUSTARRAY(nnz);
2155:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2157:           /* assign the pointer */
2158:           matstruct->mat = mat;
2159: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2160:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2161:             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2162:                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2163:             PetscCallCUSPARSE(stat);
2164:           }
2165: #endif
2166:         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2167: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2168:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2169: #else
2170:           CsrMatrix *mat = new CsrMatrix;
2171:           mat->num_rows = m;
2172:           mat->num_cols = A->cmap->n;
2173:           mat->num_entries = nnz;
2174:           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2175:           mat->row_offsets->assign(ii, ii + m + 1);

2177:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2178:           mat->column_indices->assign(a->j, a->j + nnz);

2180:           mat->values = new THRUSTARRAY(nnz);
2181:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2183:           cusparseHybMat_t hybMat;
2184:           cusparseCreateHybMat(&hybMat);
2185:           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2186:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2187:           PetscCallCUSPARSE(stat);
2188:           /* assign the pointer */
2189:           matstruct->mat = hybMat;

2191:           if (mat) {
2192:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2193:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2194:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2195:             delete (CsrMatrix *)mat;
2196:           }
2197: #endif
2198:         }

2200:         /* assign the compressed row indices */
2201:         if (a->compressedrow.use) {
2202:           cusparsestruct->workVector = new THRUSTARRAY(m);
2203:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2204:           matstruct->cprowIndices->assign(ridx, ridx + m);
2205:           tmp = m;
2206:         } else {
2207:           cusparsestruct->workVector = NULL;
2208:           matstruct->cprowIndices    = NULL;
2209:           tmp                        = 0;
2210:         }
2211:         PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar));

2213:         /* assign the pointer */
2214:         cusparsestruct->mat = matstruct;
2215:       } catch (char *ex) {
2216:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2217:       }
2218:       WaitForCUDA();
2219:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2220:       cusparsestruct->nonzerostate = A->nonzerostate;
2221:     }
2222:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2223:   }
2224:   return 0;
2225: }
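
/*
   Illustrative sketch (not part of this source file): MatSeqAIJCUSPARSECopyToGPU() above is lazy and
   incremental.  When the nonzero pattern is unchanged (same nonzerostate, CSR format) only the values
   are re-uploaded; otherwise the whole device CSR structure is rebuilt.  A sequence that takes the
   values-only branch, assuming an assembled MATSEQAIJCUSPARSE matrix A and vectors x, y:

     PetscCall(MatSetValue(A, 0, 0, 3.0, INSERT_VALUES));   // overwrite an existing entry on the host
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));    // pattern and nonzerostate stay the same
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatMult(A, x, y));                           // CopyToGPU() only reassigns matrix->values
*/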

2227: struct VecCUDAPlusEquals {
2228:   template <typename Tuple>
2229:   __host__ __device__ void operator()(Tuple t)
2230:   {
2231:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2232:   }
2233: };

2235: struct VecCUDAEquals {
2236:   template <typename Tuple>
2237:   __host__ __device__ void operator()(Tuple t)
2238:   {
2239:     thrust::get<1>(t) = thrust::get<0>(t);
2240:   }
2241: };

2243: struct VecCUDAEqualsReverse {
2244:   template <typename Tuple>
2245:   __host__ __device__ void operator()(Tuple t)
2246:   {
2247:     thrust::get<0>(t) = thrust::get<1>(t);
2248:   }
2249: };

2251: struct MatMatCusparse {
2252:   PetscBool      cisdense;
2253:   PetscScalar   *Bt;
2254:   Mat            X;
2255:   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2256:   PetscLogDouble flops;
2257:   CsrMatrix     *Bcsr;

2259: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2260:   cusparseSpMatDescr_t matSpBDescr;
2261:   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2262:   cusparseDnMatDescr_t matBDescr;
2263:   cusparseDnMatDescr_t matCDescr;
2264:   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2265:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2266:   void *dBuffer4;
2267:   void *dBuffer5;
2268:   #endif
2269:   size_t                mmBufferSize;
2270:   void                 *mmBuffer;
2271:   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2272:   cusparseSpGEMMDescr_t spgemmDesc;
2273: #endif
2274: };

2276: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2277: {
2278:   MatMatCusparse *mmdata = (MatMatCusparse *)data;

2280:   cudaFree(mmdata->Bt);
2281:   delete mmdata->Bcsr;
2282: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2283:   if (mmdata->matSpBDescr) cusparseDestroySpMat(mmdata->matSpBDescr);
2284:   if (mmdata->matBDescr) cusparseDestroyDnMat(mmdata->matBDescr);
2285:   if (mmdata->matCDescr) cusparseDestroyDnMat(mmdata->matCDescr);
2286:   if (mmdata->spgemmDesc) cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
2287:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2288:   if (mmdata->dBuffer4) cudaFree(mmdata->dBuffer4);
2289:   if (mmdata->dBuffer5) cudaFree(mmdata->dBuffer5);
2290:   #endif
2291:   if (mmdata->mmBuffer) cudaFree(mmdata->mmBuffer);
2292:   if (mmdata->mmBuffer2) cudaFree(mmdata->mmBuffer2);
2293: #endif
2294:   MatDestroy(&mmdata->X);
2295:   PetscFree(data);
2296:   return 0;
2297: }

2299: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

2301: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2302: {
2303:   Mat_Product                  *product = C->product;
2304:   Mat                           A, B;
2305:   PetscInt                      m, n, blda, clda;
2306:   PetscBool                     flg, biscuda;
2307:   Mat_SeqAIJCUSPARSE           *cusp;
2308:   cusparseStatus_t              stat;
2309:   cusparseOperation_t           opA;
2310:   const PetscScalar            *barray;
2311:   PetscScalar                  *carray;
2312:   MatMatCusparse               *mmdata;
2313:   Mat_SeqAIJCUSPARSEMultStruct *mat;
2314:   CsrMatrix                    *csrmat;

2316:   MatCheckProduct(C, 1);
2318:   mmdata = (MatMatCusparse *)product->data;
2319:   A      = product->A;
2320:   B      = product->B;
2321:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2323:   /* Currently, CopyToGPU() does not copy if the matrix is bound to the CPU;
2324:      instead of silently accepting a wrong answer, we prefer to raise an error */
2326:   MatSeqAIJCUSPARSECopyToGPU(A);
2327:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2328:   switch (product->type) {
2329:   case MATPRODUCT_AB:
2330:   case MATPRODUCT_PtAP:
2331:     mat = cusp->mat;
2332:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2333:     m   = A->rmap->n;
2334:     n   = B->cmap->n;
2335:     break;
2336:   case MATPRODUCT_AtB:
2337:     if (!A->form_explicit_transpose) {
2338:       mat = cusp->mat;
2339:       opA = CUSPARSE_OPERATION_TRANSPOSE;
2340:     } else {
2341:       MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2342:       mat = cusp->matTranspose;
2343:       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2344:     }
2345:     m = A->cmap->n;
2346:     n = B->cmap->n;
2347:     break;
2348:   case MATPRODUCT_ABt:
2349:   case MATPRODUCT_RARt:
2350:     mat = cusp->mat;
2351:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2352:     m   = A->rmap->n;
2353:     n   = B->rmap->n;
2354:     break;
2355:   default:
2356:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2357:   }
2359:   csrmat = (CsrMatrix *)mat->mat;
2360:   /* if the user passed a CPU matrix, copy the data to the GPU */
2361:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda);
2362:   if (!biscuda) MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
2363:   MatDenseCUDAGetArrayRead(B, &barray);

2365:   MatDenseGetLDA(B, &blda);
2366:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2367:     MatDenseCUDAGetArrayWrite(mmdata->X, &carray);
2368:     MatDenseGetLDA(mmdata->X, &clda);
2369:   } else {
2370:     MatDenseCUDAGetArrayWrite(C, &carray);
2371:     MatDenseGetLDA(C, &clda);
2372:   }

2374:   PetscLogGpuTimeBegin();
2375: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2376:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2377:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2378:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2379:     size_t mmBufferSize;
2380:     if (mmdata->initialized && mmdata->Blda != blda) {
2381:       cusparseDestroyDnMat(mmdata->matBDescr);
2382:       mmdata->matBDescr = NULL;
2383:     }
2384:     if (!mmdata->matBDescr) {
2385:       cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL);
2386:       mmdata->Blda = blda;
2387:     }

2389:     if (mmdata->initialized && mmdata->Clda != clda) {
2390:       cusparseDestroyDnMat(mmdata->matCDescr);
2391:       mmdata->matCDescr = NULL;
2392:     }
2393:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2394:       cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL);
2395:       mmdata->Clda = clda;
2396:     }

2398:     if (!mat->matDescr) {
2399:       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2400:                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2401:       PetscCallCUSPARSE(stat);
2402:     }
2403:     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2404:     PetscCallCUSPARSE(stat);
2405:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2406:       cudaFree(mmdata->mmBuffer);
2407:       cudaMalloc(&mmdata->mmBuffer, mmBufferSize);
2408:       mmdata->mmBufferSize = mmBufferSize;
2409:     }
2410:     mmdata->initialized = PETSC_TRUE;
2411:   } else {
2412:     /* to be safe, always update pointers of the mats */
2413:     cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get());
2414:     cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray);
2415:     cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray);
2416:   }

2418:   /* do cusparseSpMM, which supports transpose on B */
2419:   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2420:   PetscCallCUSPARSE(stat);
2421: #else
2422:   PetscInt k;
2423:   /* cusparseXcsrmm does not support transpose on B */
2424:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2425:     cublasHandle_t cublasv2handle;
2426:     cublasStatus_t cerr;

2428:     PetscCUBLASGetHandle(&cublasv2handle);
2429:     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2430:     PetscCallCUBLAS(cerr);
2431:     blda = B->cmap->n;
2432:     k = B->cmap->n;
2433:   } else {
2434:     k = B->rmap->n;
2435:   }

2437:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2438:   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2439:   PetscCallCUSPARSE(stat);
2440: #endif
2441:   PetscLogGpuTimeEnd();
2442:   PetscLogGpuFlops(n * 2.0 * csrmat->num_entries);
2443:   MatDenseCUDARestoreArrayRead(B, &barray);
2444:   if (product->type == MATPRODUCT_RARt) {
2445:     MatDenseCUDARestoreArrayWrite(mmdata->X, &carray);
2446:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE);
2447:   } else if (product->type == MATPRODUCT_PtAP) {
2448:     MatDenseCUDARestoreArrayWrite(mmdata->X, &carray);
2449:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE);
2450:   } else {
2451:     MatDenseCUDARestoreArrayWrite(C, &carray);
2452:   }
2453:   if (mmdata->cisdense) MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C);
2454:   if (!biscuda) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
2455:   return 0;
2456: }

2458: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2459: {
2460:   Mat_Product        *product = C->product;
2461:   Mat                 A, B;
2462:   PetscInt            m, n;
2463:   PetscBool           cisdense, flg;
2464:   MatMatCusparse     *mmdata;
2465:   Mat_SeqAIJCUSPARSE *cusp;

2467:   MatCheckProduct(C, 1);
2469:   A = product->A;
2470:   B = product->B;
2471:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2473:   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2475:   switch (product->type) {
2476:   case MATPRODUCT_AB:
2477:     m = A->rmap->n;
2478:     n = B->cmap->n;
2479:     break;
2480:   case MATPRODUCT_AtB:
2481:     m = A->cmap->n;
2482:     n = B->cmap->n;
2483:     break;
2484:   case MATPRODUCT_ABt:
2485:     m = A->rmap->n;
2486:     n = B->rmap->n;
2487:     break;
2488:   case MATPRODUCT_PtAP:
2489:     m = B->cmap->n;
2490:     n = B->cmap->n;
2491:     break;
2492:   case MATPRODUCT_RARt:
2493:     m = B->rmap->n;
2494:     n = B->rmap->n;
2495:     break;
2496:   default:
2497:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2498:   }
2499:   MatSetSizes(C, m, n, m, n);
2500:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2501:   PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense);
2502:   MatSetType(C, MATSEQDENSECUDA);

2504:   /* product data */
2505:   PetscNew(&mmdata);
2506:   mmdata->cisdense = cisdense;
2507: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2508:   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2509:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar));
2510: #endif
2511:   /* for these products we need intermediate storage */
2512:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2513:     MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X);
2514:     MatSetType(mmdata->X, MATSEQDENSECUDA);
2515:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2516:       MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n);
2517:     } else {
2518:       MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n);
2519:     }
2520:   }
2521:   C->product->data    = mmdata;
2522:   C->product->destroy = MatDestroy_MatMatCusparse;

2524:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2525:   return 0;
2526: }
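
/*
   Illustrative sketch (not part of this source file): the symbolic/numeric pair above backs
   sparse-times-dense products where A is MATSEQAIJCUSPARSE.  A user typically reaches it through the
   MatProduct interface (or a convenience wrapper such as MatMatMult).  Assuming matrices A
   (MATSEQAIJCUSPARSE, not bound to the CPU) and B (MATSEQDENSE or MATSEQDENSECUDA):

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C)); // selects MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));        // runs the cusparseSpMM (or csrmm) code path above
     PetscCall(MatDestroy(&C));
*/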

2528: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2529: {
2530:   Mat_Product                  *product = C->product;
2531:   Mat                           A, B;
2532:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2533:   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2534:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2535:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2536:   PetscBool                     flg;
2537:   cusparseStatus_t              stat;
2538:   MatProductType                ptype;
2539:   MatMatCusparse               *mmdata;
2540: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2541:   cusparseSpMatDescr_t BmatSpDescr;
2542: #endif
2543:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2545:   MatCheckProduct(C, 1);
2547:   PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg);
2549:   mmdata = (MatMatCusparse *)C->product->data;
2550:   A      = product->A;
2551:   B      = product->B;
2552:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2553:     mmdata->reusesym = PETSC_FALSE;
2554:     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
2556:     Cmat = Ccusp->mat;
2558:     Ccsr = (CsrMatrix *)Cmat->mat;
2560:     goto finalize;
2561:   }
2562:   if (!c->nz) goto finalize;
2563:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2565:   PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg);
2569:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2570:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2571:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2575:   MatSeqAIJCUSPARSECopyToGPU(A);
2576:   MatSeqAIJCUSPARSECopyToGPU(B);

2578:   ptype = product->type;
2579:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2580:     ptype = MATPRODUCT_AB;
2582:   }
2583:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2584:     ptype = MATPRODUCT_AB;
2586:   }
2587:   switch (ptype) {
2588:   case MATPRODUCT_AB:
2589:     Amat = Acusp->mat;
2590:     Bmat = Bcusp->mat;
2591:     break;
2592:   case MATPRODUCT_AtB:
2593:     Amat = Acusp->matTranspose;
2594:     Bmat = Bcusp->mat;
2595:     break;
2596:   case MATPRODUCT_ABt:
2597:     Amat = Acusp->mat;
2598:     Bmat = Bcusp->matTranspose;
2599:     break;
2600:   default:
2601:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2602:   }
2603:   Cmat = Ccusp->mat;
2607:   Acsr = (CsrMatrix *)Amat->mat;
2608:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2609:   Ccsr = (CsrMatrix *)Cmat->mat;
2613:   PetscLogGpuTimeBegin();
2614: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2615:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2616:   cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2617:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2618:   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2619:   PetscCallCUSPARSE(stat);
2620:   #else
2621:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2622:   PetscCallCUSPARSE(stat);
2623:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2624:   PetscCallCUSPARSE(stat);
2625:   #endif
2626: #else
2627:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2628:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2629:   PetscCallCUSPARSE(stat);
2630: #endif
2631:   PetscLogGpuFlops(mmdata->flops);
2632:   WaitForCUDA();
2633:   PetscLogGpuTimeEnd();
2634:   C->offloadmask = PETSC_OFFLOAD_GPU;
2635: finalize:
2636:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2637:   PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
2638:   PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
2639:   PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
2640:   c->reallocs = 0;
2641:   C->info.mallocs += 0;
2642:   C->info.nz_unneeded = 0;
2643:   C->assembled = C->was_assembled = PETSC_TRUE;
2644:   C->num_ass++;
2645:   return 0;
2646: }

2648: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2649: {
2650:   Mat_Product                  *product = C->product;
2651:   Mat                           A, B;
2652:   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2653:   Mat_SeqAIJ                   *a, *b, *c;
2654:   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2655:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2656:   PetscInt                      i, j, m, n, k;
2657:   PetscBool                     flg;
2658:   cusparseStatus_t              stat;
2659:   MatProductType                ptype;
2660:   MatMatCusparse               *mmdata;
2661:   PetscLogDouble                flops;
2662:   PetscBool                     biscompressed, ciscompressed;
2663: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2664:   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2665:   cusparseSpMatDescr_t BmatSpDescr;
2666: #else
2667:   int cnz;
2668: #endif
2669:   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

2671:   MatCheckProduct(C, 1);
2673:   A = product->A;
2674:   B = product->B;
2675:   PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2677:   PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg);
2679:   a = (Mat_SeqAIJ *)A->data;
2680:   b = (Mat_SeqAIJ *)B->data;
2681:   /* product data */
2682:   PetscNew(&mmdata);
2683:   C->product->data    = mmdata;
2684:   C->product->destroy = MatDestroy_MatMatCusparse;

2686:   MatSeqAIJCUSPARSECopyToGPU(A);
2687:   MatSeqAIJCUSPARSECopyToGPU(B);
2688:   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2689:   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;

2693:   ptype = product->type;
2694:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2695:     ptype                                          = MATPRODUCT_AB;
2696:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2697:   }
2698:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2699:     ptype                                          = MATPRODUCT_AB;
2700:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2701:   }
2702:   biscompressed = PETSC_FALSE;
2703:   ciscompressed = PETSC_FALSE;
2704:   switch (ptype) {
2705:   case MATPRODUCT_AB:
2706:     m    = A->rmap->n;
2707:     n    = B->cmap->n;
2708:     k    = A->cmap->n;
2709:     Amat = Acusp->mat;
2710:     Bmat = Bcusp->mat;
2711:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2712:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2713:     break;
2714:   case MATPRODUCT_AtB:
2715:     m = A->cmap->n;
2716:     n = B->cmap->n;
2717:     k = A->rmap->n;
2718:     MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2719:     Amat = Acusp->matTranspose;
2720:     Bmat = Bcusp->mat;
2721:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2722:     break;
2723:   case MATPRODUCT_ABt:
2724:     m = A->rmap->n;
2725:     n = B->rmap->n;
2726:     k = A->cmap->n;
2727:     MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2728:     Amat = Acusp->mat;
2729:     Bmat = Bcusp->matTranspose;
2730:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2731:     break;
2732:   default:
2733:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2734:   }

2736:   /* create cusparse matrix */
2737:   MatSetSizes(C, m, n, m, n);
2738:   MatSetType(C, MATSEQAIJCUSPARSE);
2739:   c     = (Mat_SeqAIJ *)C->data;
2740:   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2741:   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2742:   Ccsr  = new CsrMatrix;

2744:   c->compressedrow.use = ciscompressed;
2745:   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2746:     c->compressedrow.nrows = a->compressedrow.nrows;
2747:     PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex);
2748:     PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows);
2749:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2750:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2751:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2752:   } else {
2753:     c->compressedrow.nrows  = 0;
2754:     c->compressedrow.i      = NULL;
2755:     c->compressedrow.rindex = NULL;
2756:     Ccusp->workVector       = NULL;
2757:     Cmat->cprowIndices      = NULL;
2758:   }
2759:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2760:   Ccusp->mat        = Cmat;
2761:   Ccusp->mat->mat   = Ccsr;
2762:   Ccsr->num_rows    = Ccusp->nrows;
2763:   Ccsr->num_cols    = n;
2764:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2765:   cusparseCreateMatDescr(&Cmat->descr);
2766:   cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
2767:   cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
2768:   cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
2769:   cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
2770:   cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2771:   cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2772:   cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2773:   cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2774:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2775:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2776:     c->nz                = 0;
2777:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2778:     Ccsr->values         = new THRUSTARRAY(c->nz);
2779:     goto finalizesym;
2780:   }

2784:   Acsr = (CsrMatrix *)Amat->mat;
2785:   if (!biscompressed) {
2786:     Bcsr = (CsrMatrix *)Bmat->mat;
2787: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2788:     BmatSpDescr = Bmat->matDescr;
2789: #endif
2790:   } else { /* we need to use row offsets for the full matrix */
2791:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2792:     Bcsr                 = new CsrMatrix;
2793:     Bcsr->num_rows       = B->rmap->n;
2794:     Bcsr->num_cols       = cBcsr->num_cols;
2795:     Bcsr->num_entries    = cBcsr->num_entries;
2796:     Bcsr->column_indices = cBcsr->column_indices;
2797:     Bcsr->values         = cBcsr->values;
2798:     if (!Bcusp->rowoffsets_gpu) {
2799:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2800:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2801:       PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
2802:     }
2803:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2804:     mmdata->Bcsr      = Bcsr;
2805: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2806:     if (Bcsr->num_rows && Bcsr->num_cols) {
2807:       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2808:       PetscCallCUSPARSE(stat);
2809:     }
2810:     BmatSpDescr = mmdata->matSpBDescr;
2811: #endif
2812:   }
2815:   /* precompute flops count */
2816:   if (ptype == MATPRODUCT_AB) {
2817:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2818:       const PetscInt st = a->i[i];
2819:       const PetscInt en = a->i[i + 1];
2820:       for (j = st; j < en; j++) {
2821:         const PetscInt brow = a->j[j];
2822:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2823:       }
2824:     }
2825:   } else if (ptype == MATPRODUCT_AtB) {
2826:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2827:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2828:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2829:       flops += (2. * anzi) * bnzi;
2830:     }
2831:   } else { /* TODO */
2832:     flops = 0.;
2833:   }

2835:   mmdata->flops = flops;
2836:   PetscLogGpuTimeBegin();

2838: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2839:   cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2840:   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2841:   PetscCallCUSPARSE(stat);
2842:   cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2843:   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2844:   {
2845:     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2846:      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2847:   */
2848:     void *dBuffer1 = NULL;
2849:     void *dBuffer2 = NULL;
2850:     void *dBuffer3 = NULL;
2851:     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2852:     size_t bufferSize1 = 0;
2853:     size_t bufferSize2 = 0;
2854:     size_t bufferSize3 = 0;
2855:     size_t bufferSize4 = 0;
2856:     size_t bufferSize5 = 0;

2858:     /*----------------------------------------------------------------------*/
2859:     /* ask bufferSize1 bytes for external memory */
2860:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2861:     PetscCallCUSPARSE(stat);
2862:     cudaMalloc((void **)&dBuffer1, bufferSize1);
2863:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2864:     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2865:     PetscCallCUSPARSE(stat);

2867:     /*----------------------------------------------------------------------*/
2868:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2869:     PetscCallCUSPARSE(stat);
2870:     cudaMalloc((void **)&dBuffer2, bufferSize2);
2871:     cudaMalloc((void **)&dBuffer3, bufferSize3);
2872:     cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4);
2873:     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2874:     PetscCallCUSPARSE(stat);
2875:     cudaFree(dBuffer1);
2876:     cudaFree(dBuffer2);

2878:     /*----------------------------------------------------------------------*/
2879:     /* get matrix C non-zero entries C_nnz1 */
2880:     cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2881:     c->nz = (PetscInt)C_nnz1;
2882:     /* allocate matrix C */
2883:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2884:     cudaPeekAtLastError(); /* catch out of memory errors */
2885:     Ccsr->values = new THRUSTARRAY(c->nz);
2886:     cudaPeekAtLastError(); /* catch out of memory errors */
2887:     /* update matC with the new pointers */
2888:     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2889:     PetscCallCUSPARSE(stat);

2891:     /*----------------------------------------------------------------------*/
2892:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2893:     PetscCallCUSPARSE(stat);
2894:     cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5);
2895:     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2896:     PetscCallCUSPARSE(stat);
2897:     cudaFree(dBuffer3);
2898:     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2899:     PetscCallCUSPARSE(stat);
2900:     PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024);
2901:   }
2902:   #else
2903:   size_t bufSize2;
2904:   /* ask bufferSize bytes for external memory */
2905:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2906:   PetscCallCUSPARSE(stat);
2907:   cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2);
2908:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2909:   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2910:   PetscCallCUSPARSE(stat);
2911:   /* ask bufferSize again bytes for external memory */
2912:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2913:   PetscCallCUSPARSE(stat);
2914:   /* Neither the CUSPARSE documentation nor the API is clear;
2915:      we need both buffers to perform the operations properly!
2916:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2917:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2918:      is stored in the descriptor! What a messy API... */
2919:   cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize);
2920:   /* compute the intermediate product of A * B */
2921:   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2922:   PetscCallCUSPARSE(stat);
2923:   /* get matrix C non-zero entries C_nnz1 */
2924:   cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2925:   c->nz = (PetscInt)C_nnz1;
2926:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2927:                       mmdata->mmBufferSize / 1024));
2928:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2929:   cudaPeekAtLastError(); /* catch out of memory errors */
2930:   Ccsr->values = new THRUSTARRAY(c->nz);
2931:   cudaPeekAtLastError(); /* catch out of memory errors */
2932:   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2933:   PetscCallCUSPARSE(stat);
2934:   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2935:   PetscCallCUSPARSE(stat);
2936:   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2937: #else
2938:   cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);
2939:   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2940:                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2941:   PetscCallCUSPARSE(stat);
2942:   c->nz = cnz;
2943:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2944:   cudaPeekAtLastError(); /* catch out of memory errors */
2945:   Ccsr->values = new THRUSTARRAY(c->nz);
2946:   cudaPeekAtLastError(); /* catch out of memory errors */

2948:   cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2949:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2950:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for values, but it seems quite buggy when
2951:      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2952:   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2953:                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2954:   PetscCallCUSPARSE(stat);
2955: #endif
2956:   PetscLogGpuFlops(mmdata->flops);
2957:   PetscLogGpuTimeEnd();
2958: finalizesym:
2959:   c->singlemalloc = PETSC_FALSE;
2960:   c->free_a       = PETSC_TRUE;
2961:   c->free_ij      = PETSC_TRUE;
2962:   PetscMalloc1(m + 1, &c->i);
2963:   PetscMalloc1(c->nz, &c->j);
2964:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2965:     PetscInt      *d_i = c->i;
2966:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2967:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2968:     ii = *Ccsr->row_offsets;
2969:     jj = *Ccsr->column_indices;
2970:     if (ciscompressed) d_i = c->compressedrow.i;
2971:     cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2972:     cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2973:   } else {
2974:     PetscInt *d_i = c->i;
2975:     if (ciscompressed) d_i = c->compressedrow.i;
2976:     cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2977:     cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2978:   }
2979:   if (ciscompressed) { /* need to expand host row offsets */
2980:     PetscInt r = 0;
2981:     c->i[0]    = 0;
2982:     for (k = 0; k < c->compressedrow.nrows; k++) {
2983:       const PetscInt next = c->compressedrow.rindex[k];
2984:       const PetscInt old  = c->compressedrow.i[k];
2985:       for (; r < next; r++) c->i[r + 1] = old;
2986:     }
2987:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2988:   }
2989:   PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
2990:   PetscMalloc1(m, &c->ilen);
2991:   PetscMalloc1(m, &c->imax);
2992:   c->maxnz         = c->nz;
2993:   c->nonzerorowcnt = 0;
2994:   c->rmax          = 0;
2995:   for (k = 0; k < m; k++) {
2996:     const PetscInt nn = c->i[k + 1] - c->i[k];
2997:     c->ilen[k] = c->imax[k] = nn;
2998:     c->nonzerorowcnt += (PetscInt) !!nn;
2999:     c->rmax = PetscMax(c->rmax, nn);
3000:   }
3001:   MatMarkDiagonal_SeqAIJ(C);
3002:   PetscMalloc1(c->nz, &c->a);
3003:   Ccsr->num_entries = c->nz;

3005:   C->nonzerostate++;
3006:   PetscLayoutSetUp(C->rmap);
3007:   PetscLayoutSetUp(C->cmap);
3008:   Ccusp->nonzerostate = C->nonzerostate;
3009:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3010:   C->preallocated     = PETSC_TRUE;
3011:   C->assembled        = PETSC_FALSE;
3012:   C->was_assembled    = PETSC_FALSE;
3013:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3014:     mmdata->reusesym = PETSC_TRUE;
3015:     C->offloadmask   = PETSC_OFFLOAD_GPU;
3016:   }
3017:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3018:   return 0;
3019: }
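
/*
   Illustrative sketch (not part of this source file): the routine above is the symbolic phase of the
   cuSPARSE SpGEMM path (C = A*B with both operands MATSEQAIJCUSPARSE).  The descriptors and buffers
   kept in MatMatCusparse are what make reuse cheap: when only the values of A or B change, just the
   numeric phase above needs to run again.  Assuming assembled MATSEQAIJCUSPARSE matrices A and B:

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric
     // ... change values (but not the nonzero pattern) of A and/or B ...
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // numeric phase only
     PetscCall(MatDestroy(&C));
*/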

3021: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

3023: /* handles sparse or dense B */
3024: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3025: {
3026:   Mat_Product *product = mat->product;
3027:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

3029:   MatCheckProduct(mat, 1);
3030:   PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense);
3031:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp);
3032:   if (product->type == MATPRODUCT_ABC) {
3033:     Ciscusp = PETSC_FALSE;
3034:     if (!product->C->boundtocpu) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp);
3035:   }
3036:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3037:     PetscBool usecpu = PETSC_FALSE;
3038:     switch (product->type) {
3039:     case MATPRODUCT_AB:
3040:       if (product->api_user) {
3041:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3042:         PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
3043:         PetscOptionsEnd();
3044:       } else {
3045:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3046:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
3047:         PetscOptionsEnd();
3048:       }
3049:       break;
3050:     case MATPRODUCT_AtB:
3051:       if (product->api_user) {
3052:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3053:         PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
3054:         PetscOptionsEnd();
3055:       } else {
3056:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3057:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
3058:         PetscOptionsEnd();
3059:       }
3060:       break;
3061:     case MATPRODUCT_PtAP:
3062:       if (product->api_user) {
3063:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3064:         PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
3065:         PetscOptionsEnd();
3066:       } else {
3067:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3068:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
3069:         PetscOptionsEnd();
3070:       }
3071:       break;
3072:     case MATPRODUCT_RARt:
3073:       if (product->api_user) {
3074:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3075:         PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
3076:         PetscOptionsEnd();
3077:       } else {
3078:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3079:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
3080:         PetscOptionsEnd();
3081:       }
3082:       break;
3083:     case MATPRODUCT_ABC:
3084:       if (product->api_user) {
3085:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3086:         PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
3087:         PetscOptionsEnd();
3088:       } else {
3089:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3090:         PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
3091:         PetscOptionsEnd();
3092:       }
3093:       break;
3094:     default:
3095:       break;
3096:     }
3097:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3098:   }
3099:   /* dispatch */
3100:   if (isdense) {
3101:     switch (product->type) {
3102:     case MATPRODUCT_AB:
3103:     case MATPRODUCT_AtB:
3104:     case MATPRODUCT_ABt:
3105:     case MATPRODUCT_PtAP:
3106:     case MATPRODUCT_RARt:
3107:       if (product->A->boundtocpu) {
3108:         MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
3109:       } else {
3110:         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3111:       }
3112:       break;
3113:     case MATPRODUCT_ABC:
3114:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3115:       break;
3116:     default:
3117:       break;
3118:     }
3119:   } else if (Biscusp && Ciscusp) {
3120:     switch (product->type) {
3121:     case MATPRODUCT_AB:
3122:     case MATPRODUCT_AtB:
3123:     case MATPRODUCT_ABt:
3124:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3125:       break;
3126:     case MATPRODUCT_PtAP:
3127:     case MATPRODUCT_RARt:
3128:     case MATPRODUCT_ABC:
3129:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3130:       break;
3131:     default:
3132:       break;
3133:     }
3134:   } else { /* fallback for AIJ */
3135:     MatProductSetFromOptions_SeqAIJ(mat);
3136:   }
3137:   return 0;
3138: }
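
/*
   Usage note (not part of this source file): the options parsed above let a user force the CPU
   fallback for a given product even when all operands are GPU matrices, e.g.

     ./app -matmatmult_backend_cpu                // api_user path (MatMatMult and friends)
     ./app -mat_product_algorithm_backend_cpu     // MatProduct interface path

   in which case the dispatch above falls back to MatProductSetFromOptions_SeqAIJ().
*/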

3140: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3141: {
3142:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE);
3143:   return 0;
3144: }

3146: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3147: {
3148:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE);
3149:   return 0;
3150: }

3152: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3153: {
3154:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE);
3155:   return 0;
3156: }

3158: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3159: {
3160:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE);
3161:   return 0;
3162: }

3164: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3165: {
3166:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE);
3167:   return 0;
3168: }

3170: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3171: {
3172:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3173:   if (i < n) y[idx[i]] += x[i];
3174: }
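
/*
   Illustrative note (not part of this source file): ScatterAdd() above adds the i-th entry of a
   (compressed) source vector into y[idx[i]], one thread per entry.  A typical launch for n entries
   on stream s would look like

     ScatterAdd<<<(n + 255) / 256, 256, 0, s>>>(n, idx, x, y);

   This is the kind of kernel used further below to add a compressed-row SpMV result back into the
   full-length output vector.
*/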

3176: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3177: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3178: {
3179:   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3180:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3181:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3182:   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3183:   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3184:   PetscBool                     compressed;
3185: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3186:   PetscInt nx, ny;
3187: #endif

3190:   if (!a->nz) {
3191:     if (!yy) VecSet_SeqCUDA(zz, 0);
3192:     else VecCopy_SeqCUDA(yy, zz);
3193:     return 0;
3194:   }
3195:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3196:   MatSeqAIJCUSPARSECopyToGPU(A);
3197:   if (!trans) {
3198:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3200:   } else {
3201:     if (herm || !A->form_explicit_transpose) {
3202:       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3203:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3204:     } else {
3205:       if (!cusparsestruct->matTranspose) MatSeqAIJCUSPARSEFormExplicitTranspose(A);
3206:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3207:     }
3208:   }
3209:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3210:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

3212:   try {
3213:     VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray);
3214:     if (yy == zz) VecCUDAGetArray(zz, &zarray); /* read & write zz, so need to get uptodate zarray on GPU */
3215:     else VecCUDAGetArrayWrite(zz, &zarray);     /* write zz, so no need to init zarray on GPU */

3217:     PetscLogGpuTimeBegin();
3218:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3219:       /* z = A x + beta y.
3220:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3221:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3222:       */
3223:       xptr = xarray;
3224:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3225:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3226: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3227:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3228:           allocated to accommodate different uses. So we get the length info directly from mat.
3229:        */
3230:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3231:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3232:         nx             = mat->num_cols;
3233:         ny             = mat->num_rows;
3234:       }
3235: #endif
3236:     } else {
3237:       /* z = A^T x + beta y
3238:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3239:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3240:        */
3241:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3242:       dptr = zarray;
3243:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3244:       if (compressed) { /* Scatter x to work vector */
3245:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

3247:         thrust::for_each(
3248: #if PetscDefined(HAVE_THRUST_ASYNC)
3249:           thrust::cuda::par.on(PetscDefaultCudaStream),
3250: #endif
3251:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3252:           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3253:       }
3254: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3255:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3256:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3257:         nx             = mat->num_rows;
3258:         ny             = mat->num_cols;
3259:       }
3260: #endif
3261:     }

3263:     /* csr_spmv does y = alpha op(A) x + beta y */
3264:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3265: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3267:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3268:         cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype);
3269:         cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype);
3270:         PetscCallCUSPARSE(
3271:           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3272:         cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize);

3274:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3275:       } else {
3276:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3277:         cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr);
3278:         cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr);
3279:       }

3281:       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3282:                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3283: #else
3284:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3285:       cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr);
3286: #endif
3287:     } else {
3288:       if (cusparsestruct->nrows) {
3289: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3290:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3291: #else
3292:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3293:         cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr);
3294: #endif
3295:       }
3296:     }
3297:     PetscLogGpuTimeEnd();

3299:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3300:       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3301:         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3302:           VecCopy_SeqCUDA(yy, zz);      /* zz = yy */
3303:         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3304:           VecAXPY_SeqCUDA(zz, 1.0, yy); /* zz += yy */
3305:         }
3306:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3307:         VecSet_SeqCUDA(zz, 0);
3308:       }

3310:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3311:       if (compressed) {
3312:         PetscLogGpuTimeBegin();
3313:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3314:            and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to store all
3315:            events to prevent that. So I just added a ScatterAdd kernel instead.
3316:          */
3317: #if 0
3318:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3319:         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3320:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3321:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3322:                          VecCUDAPlusEquals());
3323: #else
3324:         PetscInt n = matstruct->cprowIndices->size();
3325:         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3326: #endif
3327:         PetscLogGpuTimeEnd();
3328:       }
3329:     } else {
3330:       if (yy && yy != zz) { VecAXPY_SeqCUDA(zz, 1.0, yy); /* zz += yy */ }
3331:     }
3332:     VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray);
3333:     if (yy == zz) VecCUDARestoreArray(zz, &zarray);
3334:     else VecCUDARestoreArrayWrite(zz, &zarray);
3335:   } catch (char *ex) {
3336:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3337:   }
3338:   if (yy) {
3339:     PetscLogGpuFlops(2.0 * a->nz);
3340:   } else {
3341:     PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt);
3342:   }
3343:   return 0;
3344: }
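/* Illustrative sketch (not part of this implementation): what the compressed-row path above computes.
   When A stores only its nonzero rows, the SpMV produces a short vector w (one entry per nonzero row),
   and ScatterAdd places w back into the full-length result z through the compressed row index list.
   A plain-C equivalent, with hypothetical names cprow[], w[] and nonzerorowcnt:

     for (PetscInt k = 0; k < nonzerorowcnt; k++) z[cprow[k]] += w[k];
*/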

3346: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3347: {
3348:   MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE);
3349:   return 0;
3350: }

3352: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3353: {
3354:   PetscObjectState    onnz = A->nonzerostate;
3355:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

3357:   MatAssemblyEnd_SeqAIJ(A, mode);
3358:   if (onnz != A->nonzerostate && cusp->deviceMat) {
3359:     PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
3360:     cudaFree(cusp->deviceMat);
3361:     cusp->deviceMat = NULL;
3362:   }
3363:   return 0;
3364: }

3366: /* --------------------------------------------------------------------------------*/
3367: /*@
3368:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3369:    (the default parallel PETSc format). This matrix will ultimately be pushed down
3370:    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3371:    assembly performance the user should preallocate the matrix storage by setting
3372:    the parameter nz (or the array nnz).  By setting these parameters accurately,
3373:    performance during matrix assembly can be increased by more than a factor of 50.

3375:    Collective

3377:    Input Parameters:
3378: +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3379: .  m - number of rows
3380: .  n - number of columns
3381: .  nz - number of nonzeros per row (same for all rows)
3382: -  nnz - array containing the number of nonzeros in the various rows
3383:          (possibly different for each row) or NULL

3385:    Output Parameter:
3386: .  A - the matrix

3388:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3389:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3390:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

3392:    Notes:
3393:    If nnz is given then nz is ignored

3395:    The AIJ format, also called
3396:    compressed row storage, is fully compatible with standard Fortran 77
3397:    storage.  That is, the stored row and column indices can begin at
3398:    either one (as in Fortran) or zero.  See the users' manual for details.

3400:    Specify the preallocated storage with either nz or nnz (not both).
3401:    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3402:    allocation.  For large problems you MUST preallocate memory or you
3403:    will get TERRIBLE performance, see the users' manual chapter on matrices.

3405:    By default, this format uses inodes (identical nodes) when possible, to
3406:    improve numerical efficiency of matrix-vector products and solves. We
3407:    search for consecutive rows with the same nonzero structure, thereby
3408:    reusing matrix information to achieve increased efficiency.

3410:    Level: intermediate

3412: .seealso: `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
3413: @*/
3414: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3415: {
3416:   MatCreate(comm, A);
3417:   MatSetSizes(*A, m, n, m, n);
3418:   MatSetType(*A, MATSEQAIJCUSPARSE);
3419:   MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
3420:   return 0;
3421: }
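/* A minimal usage sketch (illustrative sizes; assumes a CUDA-enabled PETSc build):

     Mat      A;
     PetscInt n = 100;

     MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, 5, NULL, &A);
     // ... insert values with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
     MatDestroy(&A);
*/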

3423: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3424: {
3425:   if (A->factortype == MAT_FACTOR_NONE) {
3426:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr);
3427:   } else {
3428:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr);
3429:   }
3430:   PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3431:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL);
3432:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL);
3433:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL);
3434:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL);
3435:   PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL);
3436:   PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
3437:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3438:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3439:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL);
3440:   MatDestroy_SeqAIJ(A);
3441:   return 0;
3442: }

3444: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3445: static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3446: static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3447: {
3448:   MatDuplicate_SeqAIJ(A, cpvalues, B);
3449:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B);
3450:   return 0;
3451: }

3453: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3454: {
3455:   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3456:   Mat_SeqAIJCUSPARSE *cy;
3457:   Mat_SeqAIJCUSPARSE *cx;
3458:   PetscScalar        *ay;
3459:   const PetscScalar  *ax;
3460:   CsrMatrix          *csry, *csrx;

3462:   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3463:   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3464:   if (X->ops->axpy != Y->ops->axpy) {
3465:     MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3466:     MatAXPY_SeqAIJ(Y, a, X, str);
3467:     return 0;
3468:   }
3469:   /* if we are here, it means both matrices are bound to GPU */
3470:   MatSeqAIJCUSPARSECopyToGPU(Y);
3471:   MatSeqAIJCUSPARSECopyToGPU(X);
3474:   csry = (CsrMatrix *)cy->mat->mat;
3475:   csrx = (CsrMatrix *)cx->mat->mat;
3476:   /* see if we can turn this into a cublas axpy */
3477:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3478:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3479:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3480:     if (eq) str = SAME_NONZERO_PATTERN;
3481:   }
3482:   /* spgeam is buggy with one column */
3483:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

3485:   if (str == SUBSET_NONZERO_PATTERN) {
3486:     PetscScalar b = 1.0;
3487: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3488:     size_t bufferSize;
3489:     void  *buffer;
3490: #endif

3492:     MatSeqAIJCUSPARSEGetArrayRead(X, &ax);
3493:     MatSeqAIJCUSPARSEGetArray(Y, &ay);
3494:     cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);
3495: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3496:     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3497:                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3498:     cudaMalloc(&buffer, bufferSize);
3499:     PetscLogGpuTimeBegin();
3500:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3501:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3502:     PetscLogGpuFlops(x->nz + y->nz);
3503:     PetscLogGpuTimeEnd();
3504:     cudaFree(buffer);
3505: #else
3506:     PetscLogGpuTimeBegin();
3507:     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3508:                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3509:     PetscLogGpuFlops(x->nz + y->nz);
3510:     PetscLogGpuTimeEnd();
3511: #endif
3512:     cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);
3513:     MatSeqAIJCUSPARSERestoreArrayRead(X, &ax);
3514:     MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3515:     MatSeqAIJInvalidateDiagonal(Y);
3516:   } else if (str == SAME_NONZERO_PATTERN) {
3517:     cublasHandle_t cublasv2handle;
3518:     PetscBLASInt   one = 1, bnz = 1;

3520:     MatSeqAIJCUSPARSEGetArrayRead(X, &ax);
3521:     MatSeqAIJCUSPARSEGetArray(Y, &ay);
3522:     PetscCUBLASGetHandle(&cublasv2handle);
3523:     PetscBLASIntCast(x->nz, &bnz);
3524:     PetscLogGpuTimeBegin();
3525:     cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one);
3526:     PetscLogGpuFlops(2.0 * bnz);
3527:     PetscLogGpuTimeEnd();
3528:     MatSeqAIJCUSPARSERestoreArrayRead(X, &ax);
3529:     MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3530:     MatSeqAIJInvalidateDiagonal(Y);
3531:   } else {
3532:     MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3533:     MatAXPY_SeqAIJ(Y, a, X, str);
3534:   }
3535:   return 0;
3536: }
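/* Usage sketch (illustrative): Y <- Y + 2*X with both matrices of type MATSEQAIJCUSPARSE.
   With SAME_NONZERO_PATTERN and both operands resident on the GPU, this dispatches to the
   cuBLAS axpy branch above; other patterns fall back to spgeam or the CPU path.

     MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN);
*/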

3538: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3539: {
3540:   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
3541:   PetscScalar   *ay;
3542:   cublasHandle_t cublasv2handle;
3543:   PetscBLASInt   one = 1, bnz = 1;

3545:   MatSeqAIJCUSPARSEGetArray(Y, &ay);
3546:   PetscCUBLASGetHandle(&cublasv2handle);
3547:   PetscBLASIntCast(y->nz, &bnz);
3548:   PetscLogGpuTimeBegin();
3549:   cublasXscal(cublasv2handle, bnz, &a, ay, one);
3550:   PetscLogGpuFlops(bnz);
3551:   PetscLogGpuTimeEnd();
3552:   MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3553:   MatSeqAIJInvalidateDiagonal(Y);
3554:   return 0;
3555: }

3557: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3558: {
3559:   PetscBool   both = PETSC_FALSE;
3560:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3562:   if (A->factortype == MAT_FACTOR_NONE) {
3563:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3564:     if (spptr->mat) {
3565:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3566:       if (matrix->values) {
3567:         both = PETSC_TRUE;
3568:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3569:       }
3570:     }
3571:     if (spptr->matTranspose) {
3572:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3573:       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3574:     }
3575:   }
3576:   PetscArrayzero(a->a, a->i[A->rmap->n]);
3577:   MatSeqAIJInvalidateDiagonal(A);
3578:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3579:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3580:   return 0;
3581: }

3583: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3584: {
3585:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3587:   if (A->factortype != MAT_FACTOR_NONE) {
3588:     A->boundtocpu = flg;
3589:     return 0;
3590:   }
3591:   if (flg) {
3592:     MatSeqAIJCUSPARSECopyFromGPU(A);

3594:     A->ops->scale                     = MatScale_SeqAIJ;
3595:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3596:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3597:     A->ops->mult                      = MatMult_SeqAIJ;
3598:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3599:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3600:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3601:     A->ops->multhermitiantranspose    = NULL;
3602:     A->ops->multhermitiantransposeadd = NULL;
3603:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3604:     PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps));
3605:     PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3606:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL);
3607:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL);
3608:     PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3609:     PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3610:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL);
3611:   } else {
3612:     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3613:     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3614:     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3615:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3616:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3617:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3618:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3619:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3620:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3621:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3622:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3623:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3624:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3625:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3626:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3627:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3628:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

3630:     PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3631:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3632:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3633:     PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE);
3634:     PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE);
3635:     PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3636:   }
3637:   A->boundtocpu = flg;
3638:   if (flg && a->inode.size) {
3639:     a->inode.use = PETSC_TRUE;
3640:   } else {
3641:     a->inode.use = PETSC_FALSE;
3642:   }
3643:   return 0;
3644: }
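/* Usage sketch (illustrative): temporarily forcing operations onto the CPU and back.

     MatBindToCPU(A, PETSC_TRUE);  // data is synced to the host and the CPU kernels are installed
     // ... host-side work ...
     MatBindToCPU(A, PETSC_FALSE); // reinstall the GPU kernels
*/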

3646: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3647: {
3648:   Mat B;

3650:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* first use of CUSPARSE may be via MatConvert */
3651:   if (reuse == MAT_INITIAL_MATRIX) {
3652:     MatDuplicate(A, MAT_COPY_VALUES, newmat);
3653:   } else if (reuse == MAT_REUSE_MATRIX) {
3654:     MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
3655:   }
3656:   B = *newmat;

3658:   PetscFree(B->defaultvectype);
3659:   PetscStrallocpy(VECCUDA, &B->defaultvectype);

3661:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3662:     if (B->factortype == MAT_FACTOR_NONE) {
3663:       Mat_SeqAIJCUSPARSE *spptr;
3664:       PetscNew(&spptr);
3665:       cusparseCreate(&spptr->handle);
3666:       cusparseSetStream(spptr->handle, PetscDefaultCudaStream);
3667:       spptr->format = MAT_CUSPARSE_CSR;
3668: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3669:   #if CUSPARSE_VERSION > 11301
3670:       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3671:   #else
3672:       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3673:   #endif
3674:       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3675:       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3676: #endif
3677:       B->spptr = spptr;
3678:     } else {
3679:       Mat_SeqAIJCUSPARSETriFactors *spptr;

3681:       PetscNew(&spptr);
3682:       cusparseCreate(&spptr->handle);
3683:       cusparseSetStream(spptr->handle, PetscDefaultCudaStream);
3684:       B->spptr = spptr;
3685:     }
3686:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3687:   }
3688:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3689:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3690:   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3691:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3692:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3693:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

3695:   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE);
3696:   PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE);
3697:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3698: #if defined(PETSC_HAVE_HYPRE)
3699:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE);
3700: #endif
3701:   PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);
3702:   return 0;
3703: }

3705: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3706: {
3707:   MatCreate_SeqAIJ(B);
3708:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B);
3709:   return 0;
3710: }

3712: /*MC
3713:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

3715:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3716:    CSR, ELL, or Hybrid format.
3717:    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

3719:    Options Database Keys:
3720: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3721: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3722: .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3723: -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

3725:   Level: beginner

3727: .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3728: M*/
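/* Usage sketch (illustrative): selecting this type from the options database rather than in code.

     MatSetFromOptions(A);   // run with: -mat_type aijcusparse -mat_cusparse_storage_format csr

   or explicitly:

     MatSetType(A, MATSEQAIJCUSPARSE);
*/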

3730: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);

3732: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3733: {
3734:   MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band);
3735:   MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse);
3736:   MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse);
3737:   MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse);
3738:   MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse);

3740:   return 0;
3741: }
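/* Usage sketch (illustrative): requesting a cuSPARSE factorization registered above, either
   programmatically or from the command line (e.g., -pc_type ilu -pc_factor_mat_solver_type cusparse).

     Mat F;

     MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F);
*/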

3743: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3744: {
3745:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

3747:   if (!cusp) return 0;
3748:   delete cusp->cooPerm;
3749:   delete cusp->cooPerm_a;
3750:   cusp->cooPerm   = NULL;
3751:   cusp->cooPerm_a = NULL;
3752:   if (cusp->use_extended_coo) {
3753:     cudaFree(cusp->jmap_d);
3754:     cudaFree(cusp->perm_d);
3755:   }
3756:   cusp->use_extended_coo = PETSC_FALSE;
3757:   return 0;
3758: }

3760: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3761: {
3762:   if (*cusparsestruct) {
3763:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format);
3764:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format);
3765:     delete (*cusparsestruct)->workVector;
3766:     delete (*cusparsestruct)->rowoffsets_gpu;
3767:     delete (*cusparsestruct)->cooPerm;
3768:     delete (*cusparsestruct)->cooPerm_a;
3769:     delete (*cusparsestruct)->csr2csc_i;
3770:     if ((*cusparsestruct)->handle) cusparseDestroy((*cusparsestruct)->handle);
3771:     if ((*cusparsestruct)->jmap_d) cudaFree((*cusparsestruct)->jmap_d);
3772:     if ((*cusparsestruct)->perm_d) cudaFree((*cusparsestruct)->perm_d);
3773:     PetscFree(*cusparsestruct);
3774:   }
3775:   return 0;
3776: }

3778: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3779: {
3780:   if (*mat) {
3781:     delete (*mat)->values;
3782:     delete (*mat)->column_indices;
3783:     delete (*mat)->row_offsets;
3784:     delete *mat;
3785:     *mat = 0;
3786:   }
3787:   return 0;
3788: }

3790: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3791: {
3792:   if (*trifactor) {
3793:     if ((*trifactor)->descr) cusparseDestroyMatDescr((*trifactor)->descr);
3794:     if ((*trifactor)->solveInfo) cusparseDestroyCsrsvInfo((*trifactor)->solveInfo);
3795:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
3796:     if ((*trifactor)->solveBuffer) cudaFree((*trifactor)->solveBuffer);
3797:     if ((*trifactor)->AA_h) cudaFreeHost((*trifactor)->AA_h);
3798: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3799:     if ((*trifactor)->csr2cscBuffer) cudaFree((*trifactor)->csr2cscBuffer);
3800: #endif
3801:     PetscFree(*trifactor);
3802:   }
3803:   return 0;
3804: }

3806: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3807: {
3808:   CsrMatrix *mat;

3810:   if (*matstruct) {
3811:     if ((*matstruct)->mat) {
3812:       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3813: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3814:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3815: #else
3816:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3817:         cusparseDestroyHybMat(hybMat);
3818: #endif
3819:       } else {
3820:         mat = (CsrMatrix *)(*matstruct)->mat;
3821:         CsrMatrix_Destroy(&mat);
3822:       }
3823:     }
3824:     if ((*matstruct)->descr) cusparseDestroyMatDescr((*matstruct)->descr);
3825:     delete (*matstruct)->cprowIndices;
3826:     if ((*matstruct)->alpha_one) cudaFree((*matstruct)->alpha_one);
3827:     if ((*matstruct)->beta_zero) cudaFree((*matstruct)->beta_zero);
3828:     if ((*matstruct)->beta_one) cudaFree((*matstruct)->beta_one);

3830: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3831:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3832:     if (mdata->matDescr) cusparseDestroySpMat(mdata->matDescr);
3833:     for (int i = 0; i < 3; i++) {
3834:       if (mdata->cuSpMV[i].initialized) {
3835:         cudaFree(mdata->cuSpMV[i].spmvBuffer);
3836:         cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);
3837:         cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);
3838:       }
3839:     }
3840: #endif
3841:     delete *matstruct;
3842:     *matstruct = NULL;
3843:   }
3844:   return 0;
3845: }

3847: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3848: {
3849:   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

3851:   if (fs) {
3852:     MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);
3853:     MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);
3854:     MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);
3855:     MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);
3856:     delete fs->rpermIndices;
3857:     delete fs->cpermIndices;
3858:     delete fs->workVector;
3859:     fs->rpermIndices = NULL;
3860:     fs->cpermIndices = NULL;
3861:     fs->workVector   = NULL;
3862:     if (fs->a_band_d) cudaFree(fs->a_band_d);
3863:     if (fs->i_band_d) cudaFree(fs->i_band_d);
3864:     fs->init_dev_prop = PETSC_FALSE;
3865: #if CUSPARSE_VERSION >= 11500
3866:     cudaFree(fs->csrRowPtr);
3867:     cudaFree(fs->csrColIdx);
3868:     cudaFree(fs->csrVal);
3869:     cudaFree(fs->X);
3870:     cudaFree(fs->Y);
3871:     // cudaFree(fs->factBuffer_M); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */
3872:     cudaFree(fs->spsvBuffer_L);
3873:     cudaFree(fs->spsvBuffer_U);
3874:     cudaFree(fs->spsvBuffer_Lt);
3875:     cudaFree(fs->spsvBuffer_Ut);
3876:     cusparseDestroyMatDescr(fs->matDescr_M);
3877:     cusparseDestroySpMat(fs->spMatDescr_L);
3878:     cusparseDestroySpMat(fs->spMatDescr_U);
3879:     cusparseSpSV_destroyDescr(fs->spsvDescr_L);
3880:     cusparseSpSV_destroyDescr(fs->spsvDescr_Lt);
3881:     cusparseSpSV_destroyDescr(fs->spsvDescr_U);
3882:     cusparseSpSV_destroyDescr(fs->spsvDescr_Ut);
3883:     cusparseDestroyDnVec(fs->dnVecDescr_X);
3884:     cusparseDestroyDnVec(fs->dnVecDescr_Y);
3885:     cusparseDestroyCsrilu02Info(fs->ilu0Info_M);
3886:     cusparseDestroyCsric02Info(fs->ic0Info_M);

3888:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3889:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3890: #endif
3891:   }
3892:   return 0;
3893: }

3895: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3896: {
3897:   cusparseHandle_t handle;

3899:   if (*trifactors) {
3900:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3901:     if (handle = (*trifactors)->handle) cusparseDestroy(handle);
3902:     PetscFree(*trifactors);
3903:   }
3904:   return 0;
3905: }

3907: struct IJCompare {
3908:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3909:   {
3910:     if (t1.get<0>() < t2.get<0>()) return true;
3911:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3912:     return false;
3913:   }
3914: };

3916: struct IJEqual {
3917:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3918:   {
3919:     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3920:     return true;
3921:   }
3922: };

3924: struct IJDiff {
3925:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3926: };

3928: struct IJSum {
3929:   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3930: };

3932: #include <thrust/iterator/discard_iterator.h>
3933: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3934: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3935: {
3936:   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
3937:   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
3938:   THRUSTARRAY                          *cooPerm_v = NULL;
3939:   thrust::device_ptr<const PetscScalar> d_v;
3940:   CsrMatrix                            *matrix;
3941:   PetscInt                              n;

3945:   if (!cusp->cooPerm) {
3946:     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
3947:     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
3948:     return 0;
3949:   }
3950:   matrix = (CsrMatrix *)cusp->mat->mat;
3952:   if (!v) {
3953:     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3954:     goto finalize;
3955:   }
3956:   n = cusp->cooPerm->size();
3957:   if (isCudaMem(v)) {
3958:     d_v = thrust::device_pointer_cast(v);
3959:   } else {
3960:     cooPerm_v = new THRUSTARRAY(n);
3961:     cooPerm_v->assign(v, v + n);
3962:     d_v = cooPerm_v->data();
3963:     PetscLogCpuToGpu(n * sizeof(PetscScalar));
3964:   }
3965:   PetscLogGpuTimeBegin();
3966:   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3967:     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them together first */
3968:       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3969:       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3970:       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3971:         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of nonzeros in d_v[].
3972:         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3973:       */
3974:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3975:       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
3976:       delete cooPerm_w;
3977:     } else {
3978:       /* all nonzeros in d_v[] are unique entries */
3979:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3980:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3981:       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
3982:     }
3983:   } else {
3984:     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3985:       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3986:       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3987:     } else {
3988:       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3989:       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3990:       thrust::for_each(zibit, zieit, VecCUDAEquals());
3991:     }
3992:   }
3993:   PetscLogGpuTimeEnd();
3994: finalize:
3995:   delete cooPerm_v;
3996:   A->offloadmask = PETSC_OFFLOAD_GPU;
3997:   PetscObjectStateIncrease((PetscObject)A);
3998:   /* shorter version of MatAssemblyEnd_SeqAIJ */
3999:   PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz);
4000:   PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n");
4001:   PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax);
4002:   a->reallocs = 0;
4003:   A->info.mallocs += 0;
4004:   A->info.nz_unneeded = 0;
4005:   A->assembled = A->was_assembled = PETSC_TRUE;
4006:   A->num_ass++;
4007:   return 0;
4008: }

4010: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4011: {
4012:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

4015:   if (!cusp) return 0;
4016:   if (destroy) {
4017:     MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format);
4018:     delete cusp->csr2csc_i;
4019:     cusp->csr2csc_i = NULL;
4020:   }
4021:   A->transupdated = PETSC_FALSE;
4022:   return 0;
4023: }

4025: #include <thrust/binary_search.h>
4026: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4027: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4028: {
4029:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4030:   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
4031:   PetscInt            cooPerm_n, nzr = 0;

4033:   PetscLayoutSetUp(A->rmap);
4034:   PetscLayoutSetUp(A->cmap);
4035:   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4036:   if (n != cooPerm_n) {
4037:     delete cusp->cooPerm;
4038:     delete cusp->cooPerm_a;
4039:     cusp->cooPerm   = NULL;
4040:     cusp->cooPerm_a = NULL;
4041:   }
4042:   if (n) {
4043:     thrust::device_ptr<PetscInt> d_i, d_j;
4044:     PetscInt                    *d_raw_i, *d_raw_j;
4045:     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4046:     PetscMemType                 imtype, jmtype;

4048:     PetscGetMemType(coo_i, &imtype);
4049:     if (PetscMemTypeHost(imtype)) {
4050:       cudaMalloc(&d_raw_i, sizeof(PetscInt) * n);
4051:       cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice);
4052:       d_i        = thrust::device_pointer_cast(d_raw_i);
4053:       free_raw_i = PETSC_TRUE;
4054:       PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
4055:     } else {
4056:       d_i = thrust::device_pointer_cast(coo_i);
4057:     }

4059:     PetscGetMemType(coo_j, &jmtype);
4060:     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4061:       cudaMalloc(&d_raw_j, sizeof(PetscInt) * n);
4062:       cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice);
4063:       d_j        = thrust::device_pointer_cast(d_raw_j);
4064:       free_raw_j = PETSC_TRUE;
4065:       PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
4066:     } else {
4067:       d_j = thrust::device_pointer_cast(coo_j);
4068:     }

4070:     THRUSTINTARRAY ii(A->rmap->n);

4072:     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4073:     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

4075:     /* Ex.
4076:       n = 6
4077:       coo_i = [3,3,1,4,1,4]
4078:       coo_j = [3,2,2,5,2,6]
4079:     */
4080:     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4081:     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

4083:     PetscLogGpuTimeBegin();
4084:     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4085:     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4086:     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4087:     THRUSTINTARRAY w(d_j, d_j + n);

4089:     /*
4090:       d_i     = [1,1,3,3,4,4]
4091:       d_j     = [2,2,2,3,5,6]
4092:       cooPerm = [2,4,1,0,3,5]
4093:     */
4094:     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

4096:     /*
4097:       d_i     = [1,3,3,4,4,x]
4098:                             ^ekey
4099:       d_j     = [2,2,3,5,6,x]
4100:                            ^nekey
4101:     */
4102:     if (nekey == ekey) { /* all entries are unique */
4103:       delete cusp->cooPerm_a;
4104:       cusp->cooPerm_a = NULL;
4105:     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4106:       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4107:       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4108:       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4109:       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4110:       w[0]                  = 0;
4111:       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4112:       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4113:     }
4114:     thrust::counting_iterator<PetscInt> search_begin(0);
4115:     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4116:                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4117:                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4118:     PetscLogGpuTimeEnd();

4120:     MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
4121:     a->singlemalloc = PETSC_FALSE;
4122:     a->free_a       = PETSC_TRUE;
4123:     a->free_ij      = PETSC_TRUE;
4124:     PetscMalloc1(A->rmap->n + 1, &a->i);
4125:     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4126:     cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4127:     a->nz = a->maxnz = a->i[A->rmap->n];
4128:     a->rmax          = 0;
4129:     PetscMalloc1(a->nz, &a->a);
4130:     PetscMalloc1(a->nz, &a->j);
4131:     cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4132:     if (!a->ilen) PetscMalloc1(A->rmap->n, &a->ilen);
4133:     if (!a->imax) PetscMalloc1(A->rmap->n, &a->imax);
4134:     for (PetscInt i = 0; i < A->rmap->n; i++) {
4135:       const PetscInt nnzr = a->i[i + 1] - a->i[i];
4136:       nzr += (PetscInt) !!(nnzr);
4137:       a->ilen[i] = a->imax[i] = nnzr;
4138:       a->rmax                 = PetscMax(a->rmax, nnzr);
4139:     }
4140:     a->nonzerorowcnt = nzr;
4141:     A->preallocated  = PETSC_TRUE;
4142:     PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt));
4143:     MatMarkDiagonal_SeqAIJ(A);
4144:     if (free_raw_i) cudaFree(d_raw_i);
4145:     if (free_raw_j) cudaFree(d_raw_j);
4146:   } else {
4147:     MatSeqAIJSetPreallocation(A, 0, NULL);
4148:   }
4149:   MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);

4151:   /* We want to allocate the CUSPARSE struct for matvec now.
4152:      The code is so convoluted now that I prefer to copy zeros */
4153:   PetscArrayzero(a->a, a->nz);
4154:   MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6);
4155:   A->offloadmask = PETSC_OFFLOAD_CPU;
4156:   MatSeqAIJCUSPARSECopyToGPU(A);
4157:   MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
4158:   return 0;
4159: }
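/* Illustrative sketch (host-side thrust, not part of this file): how the upper_bound call above turns the
   sorted, deduplicated row indices into CSR row offsets, using the arrays from the comment example.

     #include <thrust/binary_search.h>
     #include <thrust/host_vector.h>
     #include <thrust/sequence.h>

     int                      rows_init[5] = {1, 3, 3, 4, 4};  // d_i after unique
     thrust::host_vector<int> rows(rows_init, rows_init + 5);
     thrust::host_vector<int> search(6), ii(6);                 // 6 = number of matrix rows
     thrust::sequence(search.begin(), search.end(), 0);         // search = [0,1,2,3,4,5]
     thrust::upper_bound(rows.begin(), rows.end(), search.begin(), search.end(), ii.begin());
     // ii = [0,1,1,3,5,5]; prepending a leading 0 gives the CSR row pointer [0,0,1,1,3,5,5]
*/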

4161: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4162: {
4163:   Mat_SeqAIJ         *seq;
4164:   Mat_SeqAIJCUSPARSE *dev;
4165:   PetscBool           coo_basic = PETSC_TRUE;
4166:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

4168:   MatResetPreallocationCOO_SeqAIJ(mat);
4169:   MatResetPreallocationCOO_SeqAIJCUSPARSE(mat);
4170:   if (coo_i) {
4171:     PetscGetMemType(coo_i, &mtype);
4172:     if (PetscMemTypeHost(mtype)) {
4173:       for (PetscCount k = 0; k < coo_n; k++) {
4174:         if (coo_i[k] < 0 || coo_j[k] < 0) {
4175:           coo_basic = PETSC_FALSE;
4176:           break;
4177:         }
4178:       }
4179:     }
4180:   }

4182:   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4183:     MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j);
4184:   } else {
4185:     MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
4186:     mat->offloadmask = PETSC_OFFLOAD_CPU;
4187:     MatSeqAIJCUSPARSECopyToGPU(mat);
4188:     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4189:     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4190:     cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount));
4191:     cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
4192:     cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount));
4193:     cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice);
4194:     dev->use_extended_coo = PETSC_TRUE;
4195:   }
4196:   return 0;
4197: }

4199: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4200: {
4201:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4202:   const PetscCount grid_size = gridDim.x * blockDim.x;
4203:   for (; i < nnz; i += grid_size) {
4204:     PetscScalar sum = 0.0;
4205:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4206:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4207:   }
4208: }

4210: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4211: {
4212:   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4213:   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4214:   PetscCount          Annz = seq->nz;
4215:   PetscMemType        memtype;
4216:   const PetscScalar  *v1 = v;
4217:   PetscScalar        *Aa;

4219:   if (dev->use_extended_coo) {
4220:     PetscGetMemType(v, &memtype);
4221:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, copy it to device */
4222:       cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar));
4223:       cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
4224:     }

4226:     if (imode == INSERT_VALUES) MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa);
4227:     else MatSeqAIJCUSPARSEGetArray(A, &Aa);

4229:     if (Annz) {
4230:       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4231:       cudaPeekAtLastError();
4232:     }

4234:     if (imode == INSERT_VALUES) MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa);
4235:     else MatSeqAIJCUSPARSERestoreArray(A, &Aa);

4237:     if (PetscMemTypeHost(memtype)) cudaFree((void *)v1);
4238:   } else {
4239:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode);
4240:   }
4241:   return 0;
4242: }
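/* Usage sketch (illustrative indices/values): assembling a MATSEQAIJCUSPARSE matrix through the COO interface.

     PetscInt    coo_i[] = {0, 0, 1, 2};
     PetscInt    coo_j[] = {0, 1, 1, 2};
     PetscScalar coo_v[] = {1.0, 2.0, 3.0, 4.0};

     MatSetPreallocationCOO(A, 4, coo_i, coo_j);
     MatSetValuesCOO(A, coo_v, INSERT_VALUES);
*/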

4244: /*@C
4245:     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

4247:    Not collective

4249:     Input Parameters:
4250: +   A - the matrix
4251: -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

4253:     Output Parameters:
4254: +   ia - the CSR row pointers
4255: -   ja - the CSR column indices

4257:     Level: developer

4259:     Note:
4260:       When compressed is true, the CSR structure does not contain empty rows

4262: .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4263: @*/
4264: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4265: {
4266:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4267:   CsrMatrix          *csr;
4268:   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

4271:   if (!i || !j) return 0;
4274:   MatSeqAIJCUSPARSECopyToGPU(A);
4276:   csr = (CsrMatrix *)cusp->mat->mat;
4277:   if (i) {
4278:     if (!compressed && a->compressedrow.use) { /* need full row offset */
4279:       if (!cusp->rowoffsets_gpu) {
4280:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4281:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4282:         PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4283:       }
4284:       *i = cusp->rowoffsets_gpu->data().get();
4285:     } else *i = csr->row_offsets->data().get();
4286:   }
4287:   if (j) *j = csr->column_indices->data().get();
4288:   return 0;
4289: }

4291: /*@C
4292:     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

4294:    Not collective

4296:     Input Parameters:
4297: +   A - the matrix
4298: -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form

4300:     Output Parameters:
4301: +   ia - the CSR row pointers
4302: -   ja - the CSR column indices

4304:     Level: developer

4306: .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4307: @*/
4308: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4309: {
4312:   if (i) *i = NULL;
4313:   if (j) *j = NULL;
4314:   return 0;
4315: }
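/* Usage sketch (illustrative): inspecting the device CSR index arrays.

     const int *ia, *ja;

     MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &ia, &ja);
     // ia/ja point to device memory; hand them to a kernel or a cuSPARSE call here
     MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &ia, &ja);
*/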

4317: /*@C
4318:    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4320:    Not Collective

4322:    Input Parameter:
4323: .   A - a `MATSEQAIJCUSPARSE` matrix

4325:    Output Parameter:
4326: .   a - pointer to the device data

4328:    Level: developer

4330:    Note:
4331:    May trigger host-device copies if up-to-date matrix data is on host

4333: .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4334: @*/
4335: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4336: {
4337:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4338:   CsrMatrix          *csr;

4344:   MatSeqAIJCUSPARSECopyToGPU(A);
4346:   csr = (CsrMatrix *)cusp->mat->mat;
4348:   *a = csr->values->data().get();
4349:   return 0;
4350: }

4352: /*@C
4353:    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

4355:    Not Collective

4357:    Input Parameter:
4358: .   A - a `MATSEQAIJCUSPARSE` matrix

4360:    Output Parameter:
4361: .   a - pointer to the device data

4363:    Level: developer

4365: .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4366: @*/
4367: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4368: {
4372:   *a = NULL;
4373:   return 0;
4374: }
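/* Usage sketch (illustrative): read-only access to the device values of the CSR matrix.

     const PetscScalar *aa;

     MatSeqAIJCUSPARSEGetArrayRead(A, &aa);
     // aa points to device memory holding the nonzero values
     MatSeqAIJCUSPARSERestoreArrayRead(A, &aa);
*/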

4376: /*@C
4377:    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4379:    Not Collective

4381:    Input Parameter:
4382: .   A - a `MATSEQAIJCUSPARSE` matrix

4384:    Output Parameter:
4385: .   a - pointer to the device data

4387:    Level: developer

4389:    Note:
4390:    May trigger host-device copies if up-to-date matrix data is on host

4392: .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4393: @*/
4394: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4395: {
4396:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4397:   CsrMatrix          *csr;

4403:   MatSeqAIJCUSPARSECopyToGPU(A);
4405:   csr = (CsrMatrix *)cusp->mat->mat;
4407:   *a             = csr->values->data().get();
4408:   A->offloadmask = PETSC_OFFLOAD_GPU;
4409:   MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
4410:   return 0;
4411: }
4412: /*@C
4413:    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

4415:    Not Collective

4417:    Input Parameter:
4418: .   A - a `MATSEQAIJCUSPARSE` matrix

4420:    Output Parameter:
4421: .   a - pointer to the device data

4423:    Level: developer

4425: .seealso: `MatSeqAIJCUSPARSEGetArray()`
4426: @*/
4427: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4428: {
4432:   MatSeqAIJInvalidateDiagonal(A);
4433:   PetscObjectStateIncrease((PetscObject)A);
4434:   *a = NULL;
4435:   return 0;
4436: }

4438: /*@C
4439:    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

4441:    Not Collective

4443:    Input Parameter:
4444: .   A - a `MATSEQAIJCUSPARSE` matrix

4446:    Output Parameter:
4447: .   a - pointer to the device data

4449:    Level: developer

4451:    Note:
4452:    Does not trigger host-device copies and flags data validity on the GPU

4454: .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4455: @*/
4456: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4457: {
4458:   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4459:   CsrMatrix          *csr;

4466:   csr = (CsrMatrix *)cusp->mat->mat;
4468:   *a             = csr->values->data().get();
4469:   A->offloadmask = PETSC_OFFLOAD_GPU;
4470:   MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
4471:   return 0;
4472: }

4474: /*@C
4475:    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

4477:    Not Collective

4479:    Input Parameter:
4480: .   A - a `MATSEQAIJCUSPARSE` matrix

4482:    Output Parameter:
4483: .   a - pointer to the device data

4485:    Level: developer

4487: .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4488: @*/
4489: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4490: {
4494:   MatSeqAIJInvalidateDiagonal(A);
4495:   PetscObjectStateIncrease((PetscObject)A);
4496:   *a = NULL;
4497:   return 0;
4498: }
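
/* Editorial usage sketch (not part of this source file): overwrite every stored value of a
   MATSEQAIJCUSPARSE matrix directly on the device. Because the old values are discarded,
   the write-only accessor above avoids any host-to-device copy. The helper name is
   illustrative and error checking is elided. */
#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>

static PetscErrorCode SetAllDeviceValues_Sketch(Mat A, PetscScalar alpha)
{
  PetscScalar *d_v;
  PetscInt     nz = ((Mat_SeqAIJ *)A->data)->nz;

  MatSeqAIJCUSPARSEGetArrayWrite(A, &d_v); /* no copy from the host; previous values are ignored */
  auto p = thrust::device_pointer_cast(d_v);
  thrust::fill(thrust::device, p, p + nz, alpha);
  MatSeqAIJCUSPARSERestoreArrayWrite(A, &d_v);
  return 0;
}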

4500: struct IJCompare4 {
4501:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4502:   {
4503:     if (t1.get<0>() < t2.get<0>()) return true;
4504:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4505:     return false;
4506:   }
4507: };
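
/* Editorial note: IJCompare4 orders COO tuples by row index and then by column index, so the
   thrust::merge in MatSeqAIJCUSPARSEMergeMats() below interleaves the two row-sorted streams of
   A and B directly into row-major order. For example, merging A entries (0,1),(1,0) with B
   entries (0,5),(1,4) yields (0,1),(0,5),(1,0),(1,4). The fourth tuple element (1 for A, 0 for B)
   only records the origin of each entry and does not influence the comparison. */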

4509: struct Shift {
4510:   int _shift;

4512:   Shift(int shift) : _shift(shift) { }
4513:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4514: };
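
/* Editorial sketch (illustration only, not part of this source file): Shift composed with a
   transform iterator presents B's column indices offset by A->cmap->n without writing to B,
   which is how the merge below forms the column indices of C = [A, B]. The small vector here
   stands in for Bcsr->column_indices. */
#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>

static void ShiftIterator_Sketch()
{
  thrust::device_vector<int> bcols(3);
  bcols[0] = 0; bcols[1] = 2; bcols[2] = 5;
  const int ncolsA = 4; /* plays the role of A->cmap->n */
  auto b = thrust::make_transform_iterator(bcols.begin(), Shift(ncolsA));
  auto e = thrust::make_transform_iterator(bcols.end(), Shift(ncolsA));
  thrust::device_vector<int> shifted(b, e); /* {4, 6, 9}; bcols itself is unchanged */
  (void)shifted;
}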

4516: /* Merges two SeqAIJCUSPARSE matrices A and B (with the same number of rows) by concatenating their rows side by side, producing C = [A, B] ([A';B']' in MATLAB notation) */
4517: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4518: {
4519:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4520:   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4521:   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4522:   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4523:   PetscInt                      Annz, Bnnz;
4524:   cusparseStatus_t              stat;
4525:   PetscInt                      i, m, n, zero = 0;

4536:   if (reuse == MAT_INITIAL_MATRIX) {
4537:     m = A->rmap->n;
4538:     n = A->cmap->n + B->cmap->n;
4539:     MatCreate(PETSC_COMM_SELF, C);
4540:     MatSetSizes(*C, m, n, m, n);
4541:     MatSetType(*C, MATSEQAIJCUSPARSE);
4542:     c                       = (Mat_SeqAIJ *)(*C)->data;
4543:     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4544:     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4545:     Ccsr                    = new CsrMatrix;
4546:     Cmat->cprowIndices      = NULL;
4547:     c->compressedrow.use    = PETSC_FALSE;
4548:     c->compressedrow.nrows  = 0;
4549:     c->compressedrow.i      = NULL;
4550:     c->compressedrow.rindex = NULL;
4551:     Ccusp->workVector       = NULL;
4552:     Ccusp->nrows            = m;
4553:     Ccusp->mat              = Cmat;
4554:     Ccusp->mat->mat         = Ccsr;
4555:     Ccsr->num_rows          = m;
4556:     Ccsr->num_cols          = n;
4557:     cusparseCreateMatDescr(&Cmat->descr);
4558:     cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
4559:     cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4560:     cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
4561:     cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
4562:     cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
4563:     cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4564:     cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4565:     cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4566:     MatSeqAIJCUSPARSECopyToGPU(A);
4567:     MatSeqAIJCUSPARSECopyToGPU(B);

4571:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4572:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4573:     Annz                 = (PetscInt)Acsr->column_indices->size();
4574:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4575:     c->nz                = Annz + Bnnz;
4576:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4577:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4578:     Ccsr->values         = new THRUSTARRAY(c->nz);
4579:     Ccsr->num_entries    = c->nz;
4580:     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4581:     if (c->nz) {
4582:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4583:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4584:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4585:       THRUSTINTARRAY32 *Aroff, *Broff;

4587:       if (a->compressedrow.use) { /* need full row offset */
4588:         if (!Acusp->rowoffsets_gpu) {
4589:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4590:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4591:           PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4592:         }
4593:         Aroff = Acusp->rowoffsets_gpu;
4594:       } else Aroff = Acsr->row_offsets;
4595:       if (b->compressedrow.use) { /* need full row offset */
4596:         if (!Bcusp->rowoffsets_gpu) {
4597:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4598:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4599:           PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
4600:         }
4601:         Broff = Bcusp->rowoffsets_gpu;
4602:       } else Broff = Bcsr->row_offsets;
4603:       PetscLogGpuTimeBegin();
4604:       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4606:       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4608:       /* Using bool origin markers caused issues with large matrices on SUMMIT (CUDA 10.2.89), so int constant iterators are used instead */
4609:       auto Aperm = thrust::make_constant_iterator(1);
4610:       auto Bperm = thrust::make_constant_iterator(0);
4611: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4612:       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4613:       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4614: #else
4615:       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4616:       auto Bcib = Bcsr->column_indices->begin();
4617:       auto Bcie = Bcsr->column_indices->end();
4618:       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4619: #endif
4620:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4621:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4622:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4623:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4624:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4625:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4626:       auto p1    = Ccusp->cooPerm->begin();
4627:       auto p2    = Ccusp->cooPerm->begin();
4628:       thrust::advance(p2, Annz);
4629:       thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4());
4630: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4631:       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4632: #endif
4633:       auto cci = thrust::make_counting_iterator(zero);
4634:       auto cce = thrust::make_counting_iterator(c->nz);
4635: #if 0 //Errors on SUMMIT cuda 11.1.0
4636:       thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>());
4637: #else
4638:       auto pred = thrust::identity<int>();
4639:       thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred);
4640:       thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred);
4641: #endif
4642:       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4644:       PetscLogGpuTimeEnd();
4645:       delete wPerm;
4646:       delete Acoo;
4647:       delete Bcoo;
4648:       delete Ccoo;
4649: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4650:       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4652: #endif
4653:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B maintain explicit transposes, generate the transpose of C too */
4654:         MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4655:         MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4656:         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4657:         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4658:         CsrMatrix                    *CcsrT = new CsrMatrix;
4659:         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4660:         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4662:         (*C)->form_explicit_transpose = PETSC_TRUE;
4663:         (*C)->transupdated            = PETSC_TRUE;
4664:         Ccusp->rowoffsets_gpu         = NULL;
4665:         CmatT->cprowIndices           = NULL;
4666:         CmatT->mat                    = CcsrT;
4667:         CcsrT->num_rows               = n;
4668:         CcsrT->num_cols               = m;
4669:         CcsrT->num_entries            = c->nz;

4671:         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4672:         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4673:         CcsrT->values         = new THRUSTARRAY(c->nz);

4675:         PetscLogGpuTimeBegin();
4676:         auto rT = CcsrT->row_offsets->begin();
4677:         if (AT) {
4678:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4679:           thrust::advance(rT, -1);
4680:         }
4681:         if (BT) {
4682:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4683:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4684:           thrust::copy(titb, tite, rT);
4685:         }
4686:         auto cT = CcsrT->column_indices->begin();
4687:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4688:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4689:         auto vT = CcsrT->values->begin();
4690:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4691:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4692:         PetscLogGpuTimeEnd();

4694:         cusparseCreateMatDescr(&CmatT->descr);
4695:         cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);
4696:         cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4697:         cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar));
4698:         cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar));
4699:         cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
4700:         cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4701:         cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4702:         cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4703: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4704:         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4706: #endif
4707:         Ccusp->matTranspose = CmatT;
4708:       }
4709:     }

4711:     c->singlemalloc = PETSC_FALSE;
4712:     c->free_a       = PETSC_TRUE;
4713:     c->free_ij      = PETSC_TRUE;
4714:     PetscMalloc1(m + 1, &c->i);
4715:     PetscMalloc1(c->nz, &c->j);
4716:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4717:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4718:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4719:       ii = *Ccsr->row_offsets;
4720:       jj = *Ccsr->column_indices;
4721:       cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4722:       cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4723:     } else {
4724:       cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4725:       cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4726:     }
4727:     PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
4728:     PetscMalloc1(m, &c->ilen);
4729:     PetscMalloc1(m, &c->imax);
4730:     c->maxnz         = c->nz;
4731:     c->nonzerorowcnt = 0;
4732:     c->rmax          = 0;
4733:     for (i = 0; i < m; i++) {
4734:       const PetscInt nn = c->i[i + 1] - c->i[i];
4735:       c->ilen[i] = c->imax[i] = nn;
4736:       c->nonzerorowcnt += (PetscInt) !!nn;
4737:       c->rmax = PetscMax(c->rmax, nn);
4738:     }
4739:     MatMarkDiagonal_SeqAIJ(*C);
4740:     PetscMalloc1(c->nz, &c->a);
4741:     (*C)->nonzerostate++;
4742:     PetscLayoutSetUp((*C)->rmap);
4743:     PetscLayoutSetUp((*C)->cmap);
4744:     Ccusp->nonzerostate = (*C)->nonzerostate;
4745:     (*C)->preallocated  = PETSC_TRUE;
4746:   } else {
4748:     c = (Mat_SeqAIJ *)(*C)->data;
4749:     if (c->nz) {
4750:       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4754:       MatSeqAIJCUSPARSECopyToGPU(A);
4755:       MatSeqAIJCUSPARSECopyToGPU(B);
4758:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4759:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4760:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4766:       auto pmid = Ccusp->cooPerm->begin();
4767:       thrust::advance(pmid, Acsr->num_entries);
4768:       PetscLogGpuTimeBegin();
4769:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4770:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4771:       thrust::for_each(zibait, zieait, VecCUDAEquals());
4772:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4773:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4774:       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4775:       MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE);
4776:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4778:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4779:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4780:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4781:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4782:         auto       vT    = CcsrT->values->begin();
4783:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4784:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4785:         (*C)->transupdated = PETSC_TRUE;
4786:       }
4787:       PetscLogGpuTimeEnd();
4788:     }
4789:   }
4790:   PetscObjectStateIncrease((PetscObject)*C);
4791:   (*C)->assembled     = PETSC_TRUE;
4792:   (*C)->was_assembled = PETSC_FALSE;
4793:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4794:   return 0;
4795: }
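
/* Editorial usage sketch (not part of this source file): concatenate two small sequential
   CUSPARSE matrices into C = [A, B]. MatSeqAIJCUSPARSEMergeMats() is a PETSc-internal helper
   used elsewhere in PETSc's CUSPARSE support, so a caller needs access to its declaration;
   the sizes and values below are illustrative only and error checking is elided. */
static PetscErrorCode MergeMats_Sketch(void)
{
  Mat      A, B, C;
  PetscInt i;

  MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 2, 2, 1, NULL, &A);
  MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 2, 2, 1, NULL, &B);
  for (i = 0; i < 2; i++) {
    MatSetValue(A, i, i, 1.0, INSERT_VALUES);     /* A = identity */
    MatSetValue(B, i, 1 - i, 2.0, INSERT_VALUES); /* B = antidiagonal */
  }
  MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
  MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);

  MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C); /* C is 2 x 4, equal to [A, B] */
  /* ... after changing numerical values of A or B (same nonzero pattern) ... */
  MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C);   /* refresh the values of C only */

  MatDestroy(&A);
  MatDestroy(&B);
  MatDestroy(&C);
  return 0;
}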

4797: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4798: {
4799:   bool               dmem;
4800:   const PetscScalar *av;

4802:   dmem = isCudaMem(v);
4803:   MatSeqAIJCUSPARSEGetArrayRead(A, &av);
4804:   if (n && idx) {
4805:     THRUSTINTARRAY widx(n);
4806:     widx.assign(idx, idx + n);
4807:     PetscLogCpuToGpu(n * sizeof(PetscInt));

4809:     THRUSTARRAY                    *w = NULL;
4810:     thrust::device_ptr<PetscScalar> dv;
4811:     if (dmem) {
4812:       dv = thrust::device_pointer_cast(v);
4813:     } else {
4814:       w  = new THRUSTARRAY(n);
4815:       dv = w->data();
4816:     }
4817:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4819:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4820:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4821:     thrust::for_each(zibit, zieit, VecCUDAEquals());
4822:     if (w) cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
4823:     delete w;
4824:   } else {
4825:     cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
4826:   }
4827:   if (!dmem) PetscLogGpuToCpu(n * sizeof(PetscScalar));
4828:   MatSeqAIJCUSPARSERestoreArrayRead(A, &av);
4829:   return 0;
4830: }
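
/* Editorial sketch (illustration only, not part of this source file): the permutation-iterator /
   VecCUDAEquals pattern above performs the gather v[k] = av[idx[k]] on the device. With
   device-resident index and output buffers the same operation could be expressed with
   thrust::gather, as below; the parameter names and the int index type are illustrative. */
#include <thrust/gather.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>

static void GatherValues_Sketch(const PetscScalar *d_av, const int *d_idx, PetscScalar *d_v, int n)
{
  auto av  = thrust::device_pointer_cast(d_av);
  auto idx = thrust::device_pointer_cast(d_idx);
  auto v   = thrust::device_pointer_cast(d_v);
  thrust::gather(thrust::device, idx, idx + n, av, v); /* v[k] = av[idx[k]] */
}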