Actual source code: aijhipsparse.hip.cxx

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the HIPSPARSE library,
  4:   Portions of this code are under:
  5:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  6: */
  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/mat/impls/dense/seq/dense.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 15: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
 16: #include <thrust/adjacent_difference.h>
 17: #include <thrust/iterator/transform_iterator.h>
 18: #if PETSC_CPP_VERSION >= 14
 19:   #define PETSC_HAVE_THRUST_ASYNC 1
 20:   #include <thrust/async/for_each.h>
 21: #endif
 22: #include <thrust/iterator/constant_iterator.h>
 23: #include <thrust/iterator/discard_iterator.h>
 24: #include <thrust/binary_search.h>
 25: #include <thrust/remove.h>
 26: #include <thrust/sort.h>
 27: #include <thrust/unique.h>

 /*
   String tables handed to PetscOptionsEnum() in MatSetFromOptions_SeqAIJHIPSPARSE().
   PetscOptionsEnum() derives the enum value from the POSITION of the matched name in the table,
   so the order of the leading entries must agree with the numeric values of the corresponding
   enum (MatSetFromOptions_SeqAIJHIPSPARSE() checks this for two of the hipSPARSE enums).
   NOTE(review): the last three entries of each table look like the usual PETSc enum-table
   trailer (enum type name, common prefix of the enumerators, terminating 0) - confirm against
   the PetscOptionsEnum() documentation before reordering or extending a table.
 */
 29: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
 30: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
 31: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
 /* The CSR->CSC algorithm table below is disabled, together with the option that would use it */
 32: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

 /*
   Forward declarations of the file-local (static) routines, so that they can be referenced
   before their definitions appear.

   Note that MatSeqAIJHIPSPARSEMultStruct_Destroy is declared twice with different parameter
   lists (a C++ overload): one taking a Mat_SeqAIJHIPSPARSETriFactorStruct ** and one taking a
   Mat_SeqAIJHIPSPARSEMultStruct ** plus the storage format.
 */
 34: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 35: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 36: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 37: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 38: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 41: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 43: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 44: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 45: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
 46: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
 47: static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 48: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 49: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 50: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 51: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 52: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 53: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 54: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
 55: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 56: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
 57: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
 58: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
 59: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
 60: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
 61: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
 62: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
 63: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 64: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 65: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

 /* Routines declared with PETSC_INTERN rather than static */
 67: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
 68: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);

 70: /*
 71: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
 72: {
 73:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 75:   PetscFunctionBegin;
 76:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 77:   hipsparsestruct->stream = stream;
 78:   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
 79:   PetscFunctionReturn(PETSC_SUCCESS);
 80: }

 82: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
 83: {
 84:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 86:   PetscFunctionBegin;
 87:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 88:   if (hipsparsestruct->handle != handle) {
 89:     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
 90:     hipsparsestruct->handle = handle;
 91:   }
 92:   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
 93:   PetscFunctionReturn(PETSC_SUCCESS);
 94: }

 96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
 97: {
 98:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
 99:   PetscBool            flg;

101:   PetscFunctionBegin;
102:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
103:   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
104:   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
105:   PetscFunctionReturn(PETSC_SUCCESS);
106: }
107: */

109: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
110: {
111:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

113:   PetscFunctionBegin;
114:   switch (op) {
115:   case MAT_HIPSPARSE_MULT:
116:     hipsparsestruct->format = format;
117:     break;
118:   case MAT_HIPSPARSE_ALL:
119:     hipsparsestruct->format = format;
120:     break;
121:   default:
122:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
123:   }
124:   PetscFunctionReturn(PETSC_SUCCESS);
125: }

127: /*@
128:   MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
129:   operation. Only the `MatMult()` operation can use different GPU storage formats

131:   Not Collective

133:   Input Parameters:
134: + A      - Matrix of type `MATSEQAIJHIPSPARSE`
135: . op     - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
136:          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
137: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)

139:   Level: intermediate

141: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
144: {
145:   PetscFunctionBegin;
147:   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
148:   PetscFunctionReturn(PETSC_SUCCESS);
149: }

151: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
152: {
153:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

155:   PetscFunctionBegin;
156:   hipsparsestruct->use_cpu_solve = use_cpu;
157:   PetscFunctionReturn(PETSC_SUCCESS);
158: }

160: /*@
161:   MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.

163:   Input Parameters:
164: + A       - Matrix of type `MATSEQAIJHIPSPARSE`
165: - use_cpu - set flag for using the built-in CPU `MatSolve()`

167:   Level: intermediate

169:   Notes:
170:   The hipSparse LU solver currently computes the factors with the built-in CPU method
171:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
172:   This method to specifies if the solve is done on the CPU or GPU (GPU is the default).

174: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
175: @*/
176: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
177: {
178:   PetscFunctionBegin;
180:   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
181:   PetscFunctionReturn(PETSC_SUCCESS);
182: }

184: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
185: {
186:   PetscFunctionBegin;
187:   switch (op) {
188:   case MAT_FORM_EXPLICIT_TRANSPOSE:
189:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
190:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
191:     A->form_explicit_transpose = flg;
192:     break;
193:   default:
194:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
195:     break;
196:   }
197:   PetscFunctionReturn(PETSC_SUCCESS);
198: }

200: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
201: {
202:   PetscBool            row_identity, col_identity;
203:   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
204:   IS                   isrow = b->row, iscol = b->col;
205:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

207:   PetscFunctionBegin;
208:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
209:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
210:   B->offloadmask = PETSC_OFFLOAD_CPU;
211:   /* determine which version of MatSolve needs to be used. */
212:   PetscCall(ISIdentity(isrow, &row_identity));
213:   PetscCall(ISIdentity(iscol, &col_identity));
214:   if (!hipsparsestruct->use_cpu_solve) {
215:     if (row_identity && col_identity) {
216:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
217:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
218:     } else {
219:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
220:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
221:     }
222:   }
223:   B->ops->matsolve          = NULL;
224:   B->ops->matsolvetranspose = NULL;

226:   /* get the triangular factors */
227:   if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
228:   PetscFunctionReturn(PETSC_SUCCESS);
229: }

231: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
232: {
233:   MatHIPSPARSEStorageFormat format;
234:   PetscBool                 flg;
235:   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

237:   PetscFunctionBegin;
238:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
239:   if (A->factortype == MAT_FACTOR_NONE) {
240:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
241:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
242:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
243:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
244:     PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
245:     if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
246:     PetscCall(
247:       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
248:     /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
249:     PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
250:     PetscCall(
251:       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
252:     PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
253:     /*
254:     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
255:     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
256:     */
257:   }
258:   PetscOptionsHeadEnd();
259:   PetscFunctionReturn(PETSC_SUCCESS);
260: }

/*
  MatSeqAIJHIPSPARSEBuildILULowerTriMatrix - Builds, or refreshes the values of, the lower
  triangular factor stored in loTriFactorPtr of the Mat_SeqAIJHIPSPARSETriFactors held in A->spptr.

  Input Parameter:
. A - the factored matrix; its Mat_SeqAIJ arrays (i, j, a) are read on the host

  Notes:
  Nothing is done when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  First call (loTriFactorPtr is NULL): the CSR row offsets, column indices and values are
  assembled in host buffers with an explicit 1.0 diagonal entry appended to every row, copied
  into the containers of a new CsrMatrix, and the hipSPARSE csrsv buffer-size query and
  analysis are run. The host value buffer is kept in loTriFactor->AA_h.

  Later calls: only the values are regenerated (into AA_h) and copied again; the sparsity
  pattern and the analysis data are reused as they are.
*/
262: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
263: {
264:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
265:   PetscInt                            n                   = A->rmap->n;
266:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
267:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
268:   const PetscInt                     *ai = a->i, *aj = a->j, *vi;
269:   const MatScalar                    *aa = a->a, *v;
270:   PetscInt                           *AiLo, *AjLo;
271:   PetscInt                            i, nz, nzLower, offset, rowOffset;

273:   PetscFunctionBegin;
274:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
275:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
276:     try {
277:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
           /* n diagonal ones, plus the ai[n] - ai[1] entries stored for rows 1..n-1 (row 0 gets only its diagonal) */
278:       nzLower = n + ai[n] - ai[1];
279:       if (!loTriFactor) {
280:         PetscScalar *AALo;
281:         PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

283:         /* Allocate Space for the lower triangular matrix */
284:         PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
285:         PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

287:         /* Fill the lower triangular matrix */
             /* row 0 consists of the single diagonal entry (0,0) = 1 */
288:         AiLo[0]   = (PetscInt)0;
289:         AiLo[n]   = nzLower;
290:         AjLo[0]   = (PetscInt)0;
291:         AALo[0]   = (MatScalar)1.0;
292:         v         = aa;
293:         vi        = aj;
294:         offset    = 1;
295:         rowOffset = 1;
             /* rows 1..n-1: copy the nz stored entries of the row, then append the diagonal 1 */
296:         for (i = 1; i < n; i++) {
297:           nz = ai[i + 1] - ai[i];
298:           /* additional 1 for the term on the diagonal */
299:           AiLo[i] = rowOffset;
300:           rowOffset += nz + 1;

302:           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
303:           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
304:           offset += nz;
305:           AjLo[offset] = (PetscInt)i;
306:           AALo[offset] = (MatScalar)1.0;
307:           offset += 1;
308:           v += nz;
309:           vi += nz;
310:         }

312:         /* allocate space for the triangular factor information */
313:         PetscCall(PetscNew(&loTriFactor));
314:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
315:         /* Create the matrix description */
316:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
317:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
318:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
319:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
320:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

322:         /* set the operation */
323:         loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

325:         /* set the matrix */
326:         loTriFactor->csrMat                 = new CsrMatrix;
327:         loTriFactor->csrMat->num_rows       = n;
328:         loTriFactor->csrMat->num_cols       = n;
329:         loTriFactor->csrMat->num_entries    = nzLower;
330:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
331:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
332:         loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

             /* copy the host staging buffers into the CsrMatrix containers */
334:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
335:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
336:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

338:         /* Create the solve analysis information */
339:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
340:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
341:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
342:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
343:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

345:         /* perform the solve analysis */
346:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
347:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

349:         PetscCallHIP(WaitForHIP());
350:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

352:         /* assign the pointer */
353:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
             /* the value buffer is kept so the update-only branch below can refill it; the index buffers are no longer needed */
354:         loTriFactor->AA_h                                           = AALo;
355:         PetscCallHIP(hipHostFree(AiLo));
356:         PetscCallHIP(hipHostFree(AjLo));
             /* NOTE(review): indices are logged with sizeof(int), presumably because THRUSTINTARRAY32 holds 32-bit indices - confirm */
357:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
358:       } else { /* update values only */
359:         if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
360:         /* Fill the lower triangular matrix */
             /* same value layout as in the creation branch; vi is assigned here but never read in this branch */
361:         loTriFactor->AA_h[0] = 1.0;
362:         v                    = aa;
363:         vi                   = aj;
364:         offset               = 1;
365:         for (i = 1; i < n; i++) {
366:           nz = ai[i + 1] - ai[i];
367:           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
368:           offset += nz;
369:           loTriFactor->AA_h[offset] = 1.0;
370:           offset += 1;
371:           v += nz;
372:         }
373:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
374:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
375:       }
         /* NOTE(review): only exceptions thrown as char* are caught here - confirm whether exceptions
            of other types can be raised by the container operations above */
376:     } catch (char *ex) {
377:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
378:     }
379:   }
380:   PetscFunctionReturn(PETSC_SUCCESS);
381: }

/*
  MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the upper
  triangular factor stored in upTriFactorPtr of the Mat_SeqAIJHIPSPARSETriFactors held in A->spptr.

  Input Parameter:
. A - the factored matrix; its Mat_SeqAIJ arrays (j, a) and its diagonal markers are read on the host

  Notes:
  Nothing is done when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  The rows are walked from the last to the first using the diagonal markers: row i occupies
  positions adiag[i+1]+1 .. adiag[i] of the host arrays, the entry at adiag[i] being the one
  used for the diagonal. In the assembled CSR matrix the diagonal entry is placed first in its
  row and holds the reciprocal of that stored value.

  First call (upTriFactorPtr is NULL): structure and values are assembled in host buffers,
  copied into the containers of a new CsrMatrix, and the hipSPARSE csrsv buffer-size query and
  analysis are run. The host value buffer is kept in upTriFactor->AA_h.

  Later calls: only the values are regenerated (into AA_h) and copied again.
*/
383: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
384: {
385:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
386:   PetscInt                            n                   = A->rmap->n;
387:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
388:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
389:   const PetscInt                     *aj                  = a->j, *adiag, *vi;
390:   const MatScalar                    *aa                  = a->a, *v;
391:   PetscInt                           *AiUp, *AjUp;
392:   PetscInt                            i, nz, nzUpper, offset;

394:   PetscFunctionBegin;
395:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
396:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
397:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
398:     try {
399:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
400:       nzUpper = adiag[0] - adiag[n];
401:       if (!upTriFactor) {
402:         PetscScalar *AAUp;
403:         PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

405:         /* Allocate Space for the upper triangular matrix */
406:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
407:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

409:         /* Fill the upper triangular matrix */
410:         AiUp[0] = (PetscInt)0;
411:         AiUp[n] = nzUpper;
412:         offset  = nzUpper;
             /* fill from the last row backwards; offset and AiUp[i] are both derived by stepping back one row length at a time */
413:         for (i = n - 1; i >= 0; i--) {
414:           v  = aa + adiag[i + 1] + 1;
415:           vi = aj + adiag[i + 1] + 1;
416:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
417:           offset -= (nz + 1);               /* decrement the offset */

419:           /* first, set the diagonal elements */
               /* v[nz] is the entry at adiag[i]; its reciprocal becomes the first entry of row i */
420:           AjUp[offset] = (PetscInt)i;
421:           AAUp[offset] = (MatScalar)1. / v[nz];
422:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

               /* then the nz off-diagonal entries of the row, copied unchanged */
424:           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
425:           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
426:         }

428:         /* allocate space for the triangular factor information */
429:         PetscCall(PetscNew(&upTriFactor));
430:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

432:         /* Create the matrix description */
433:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
434:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
435:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
436:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
437:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

439:         /* set the operation */
440:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

442:         /* set the matrix */
443:         upTriFactor->csrMat                 = new CsrMatrix;
444:         upTriFactor->csrMat->num_rows       = n;
445:         upTriFactor->csrMat->num_cols       = n;
446:         upTriFactor->csrMat->num_entries    = nzUpper;
447:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
448:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
449:         upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
             /* copy the host staging buffers into the CsrMatrix containers */
450:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
451:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
452:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

454:         /* Create the solve analysis information */
455:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
456:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
457:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
458:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
459:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

461:         /* perform the solve analysis */
462:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
463:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

465:         PetscCallHIP(WaitForHIP());
466:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

468:         /* assign the pointer */
469:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
             /* the value buffer is kept so the update-only branch below can refill it; the index buffers are no longer needed */
470:         upTriFactor->AA_h                                           = AAUp;
471:         PetscCallHIP(hipHostFree(AiUp));
472:         PetscCallHIP(hipHostFree(AjUp));
             /* NOTE(review): indices are logged with sizeof(int), presumably because THRUSTINTARRAY32 holds 32-bit indices - confirm */
473:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
474:       } else {
             /* the factor already exists: regenerate the values only, using the same layout as above */
475:         if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
476:         /* Fill the upper triangular matrix */
477:         offset = nzUpper;
478:         for (i = n - 1; i >= 0; i--) {
479:           v  = aa + adiag[i + 1] + 1;
480:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
481:           offset -= (nz + 1);               /* decrement the offset */

483:           /* first, set the diagonal elements */
484:           upTriFactor->AA_h[offset] = 1. / v[nz];
485:           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
486:         }
487:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
488:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
489:       }
         /* NOTE(review): only exceptions thrown as char* are caught here - confirm whether exceptions
            of other types can be raised by the container operations above */
490:     } catch (char *ex) {
491:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
492:     }
493:   }
494:   PetscFunctionReturn(PETSC_SUCCESS);
495: }

497: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
498: {
499:   PetscBool                      row_identity, col_identity;
500:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
501:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
502:   IS                             isrow = a->row, iscol = a->icol;
503:   PetscInt                       n = A->rmap->n;

505:   PetscFunctionBegin;
506:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
507:   PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
508:   PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));

510:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
511:   hipsparseTriFactors->nnz = a->nz;

513:   A->offloadmask = PETSC_OFFLOAD_BOTH;
514:   /* lower triangular indices */
515:   PetscCall(ISIdentity(isrow, &row_identity));
516:   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
517:     const PetscInt *r;

519:     PetscCall(ISGetIndices(isrow, &r));
520:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
521:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
522:     PetscCall(ISRestoreIndices(isrow, &r));
523:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
524:   }
525:   /* upper triangular indices */
526:   PetscCall(ISIdentity(iscol, &col_identity));
527:   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
528:     const PetscInt *c;

530:     PetscCall(ISGetIndices(iscol, &c));
531:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
532:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
533:     PetscCall(ISRestoreIndices(iscol, &c));
534:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
535:   }
536:   PetscFunctionReturn(PETSC_SUCCESS);
537: }

539: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
540: {
541:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
542:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
543:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
544:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
545:   PetscInt                           *AiUp, *AjUp;
546:   PetscScalar                        *AAUp;
547:   PetscScalar                        *AALo;
548:   PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
549:   Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
550:   const PetscInt                     *ai = b->i, *aj = b->j, *vj;
551:   const MatScalar                    *aa = b->a, *v;

553:   PetscFunctionBegin;
554:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
555:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
556:     try {
557:       PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
558:       PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
559:       if (!upTriFactor && !loTriFactor) {
560:         /* Allocate Space for the upper triangular matrix */
561:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
562:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

564:         /* Fill the upper triangular matrix */
565:         AiUp[0] = (PetscInt)0;
566:         AiUp[n] = nzUpper;
567:         offset  = 0;
568:         for (i = 0; i < n; i++) {
569:           /* set the pointers */
570:           v  = aa + ai[i];
571:           vj = aj + ai[i];
572:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

574:           /* first, set the diagonal elements */
575:           AjUp[offset] = (PetscInt)i;
576:           AAUp[offset] = (MatScalar)1.0 / v[nz];
577:           AiUp[i]      = offset;
578:           AALo[offset] = (MatScalar)1.0 / v[nz];

580:           offset += 1;
581:           if (nz > 0) {
582:             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
583:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
584:             for (j = offset; j < offset + nz; j++) {
585:               AAUp[j] = -AAUp[j];
586:               AALo[j] = AAUp[j] / v[nz];
587:             }
588:             offset += nz;
589:           }
590:         }

592:         /* allocate space for the triangular factor information */
593:         PetscCall(PetscNew(&upTriFactor));
594:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

596:         /* Create the matrix description */
597:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
598:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
599:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
600:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
601:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

603:         /* set the matrix */
604:         upTriFactor->csrMat                 = new CsrMatrix;
605:         upTriFactor->csrMat->num_rows       = A->rmap->n;
606:         upTriFactor->csrMat->num_cols       = A->cmap->n;
607:         upTriFactor->csrMat->num_entries    = a->nz;
608:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
609:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
610:         upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
611:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
612:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
613:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

615:         /* set the operation */
616:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

618:         /* Create the solve analysis information */
619:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
620:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
621:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
622:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
623:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

625:         /* perform the solve analysis */
626:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
627:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

629:         PetscCallHIP(WaitForHIP());
630:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

632:         /* assign the pointer */
633:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

635:         /* allocate space for the triangular factor information */
636:         PetscCall(PetscNew(&loTriFactor));
637:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

639:         /* Create the matrix description */
640:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
641:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
642:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
643:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
644:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

646:         /* set the operation */
647:         loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

649:         /* set the matrix */
650:         loTriFactor->csrMat                 = new CsrMatrix;
651:         loTriFactor->csrMat->num_rows       = A->rmap->n;
652:         loTriFactor->csrMat->num_cols       = A->cmap->n;
653:         loTriFactor->csrMat->num_entries    = a->nz;
654:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
655:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
656:         loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
657:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
658:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
659:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

661:         /* Create the solve analysis information */
662:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
663:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
664:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
665:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
666:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

668:         /* perform the solve analysis */
669:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
670:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

672:         PetscCallHIP(WaitForHIP());
673:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

675:         /* assign the pointer */
676:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

678:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
679:         PetscCallHIP(hipHostFree(AiUp));
680:         PetscCallHIP(hipHostFree(AjUp));
681:       } else {
682:         /* Fill the upper triangular matrix */
683:         offset = 0;
684:         for (i = 0; i < n; i++) {
685:           /* set the pointers */
686:           v  = aa + ai[i];
687:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

689:           /* first, set the diagonal elements */
690:           AAUp[offset] = 1.0 / v[nz];
691:           AALo[offset] = 1.0 / v[nz];

693:           offset += 1;
694:           if (nz > 0) {
695:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
696:             for (j = offset; j < offset + nz; j++) {
697:               AAUp[j] = -AAUp[j];
698:               AALo[j] = AAUp[j] / v[nz];
699:             }
700:             offset += nz;
701:           }
702:         }
703:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
705:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
706:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
707:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
708:       }
709:       PetscCallHIP(hipHostFree(AAUp));
710:       PetscCallHIP(hipHostFree(AALo));
711:     } catch (char *ex) {
712:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
713:     }
714:   }
715:   PetscFunctionReturn(PETSC_SUCCESS);
716: }

718: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
719: {
720:   PetscBool                      perm_identity;
721:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
722:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
723:   IS                             ip                  = a->row;
724:   PetscInt                       n                   = A->rmap->n;

726:   PetscFunctionBegin;
727:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
728:   PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
729:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
730:   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

732:   A->offloadmask = PETSC_OFFLOAD_BOTH;
733:   /* lower triangular indices */
734:   PetscCall(ISIdentity(ip, &perm_identity));
735:   if (!perm_identity) {
736:     IS              iip;
737:     const PetscInt *irip, *rip;

739:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
740:     PetscCall(ISGetIndices(iip, &irip));
741:     PetscCall(ISGetIndices(ip, &rip));
742:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
743:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
744:     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
745:     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
746:     PetscCall(ISRestoreIndices(iip, &irip));
747:     PetscCall(ISDestroy(&iip));
748:     PetscCall(ISRestoreIndices(ip, &rip));
749:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
750:   }
751:   PetscFunctionReturn(PETSC_SUCCESS);
752: }

754: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
755: {
756:   PetscBool   perm_identity;
757:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
758:   IS          ip = b->row;

760:   PetscFunctionBegin;
761:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
762:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
763:   B->offloadmask = PETSC_OFFLOAD_CPU;
764:   /* determine which version of MatSolve needs to be used. */
765:   PetscCall(ISIdentity(ip, &perm_identity));
766:   if (perm_identity) {
767:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
768:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
769:     B->ops->matsolve          = NULL;
770:     B->ops->matsolvetranspose = NULL;
771:   } else {
772:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
773:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
774:     B->ops->matsolve          = NULL;
775:     B->ops->matsolvetranspose = NULL;
776:   }

778:   /* get the triangular factors */
779:   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
780:   PetscFunctionReturn(PETSC_SUCCESS);
781: }

783: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
784: {
785:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
786:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
787:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
788:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
789:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
790:   hipsparseIndexBase_t                indexBase;
791:   hipsparseMatrixType_t               matrixType;
792:   hipsparseFillMode_t                 fillMode;
793:   hipsparseDiagType_t                 diagType;

795:   PetscFunctionBegin;
796:   /* allocate space for the transpose of the lower triangular factor */
797:   PetscCall(PetscNew(&loTriFactorT));
798:   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

800:   /* set the matrix descriptors of the lower triangular factor */
801:   matrixType = hipsparseGetMatType(loTriFactor->descr);
802:   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
803:   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
804:   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);

806:   /* Create the matrix description */
807:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
808:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
809:   PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
810:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
811:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));

813:   /* set the operation */
814:   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

816:   /* allocate GPU space for the CSC of the lower triangular factor*/
817:   loTriFactorT->csrMat                 = new CsrMatrix;
818:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
819:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
820:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
821:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
822:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
823:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

825:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
826:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
827: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
828:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
829:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
830:                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
831:   PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
832: #endif
833: */
834:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

836:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
837:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
838: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
839:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
840:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
841: #else
842:                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
843: #endif

845:   PetscCallHIP(WaitForHIP());
846:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

848:   /* Create the solve analysis information */
849:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
850:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
851:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
852:                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
853:   PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));

855:   /* perform the solve analysis */
856:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
857:                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

859:   PetscCallHIP(WaitForHIP());
860:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

862:   /* assign the pointer */
863:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

865:   /*********************************************/
866:   /* Now the Transpose of the Upper Tri Factor */
867:   /*********************************************/

869:   /* allocate space for the transpose of the upper triangular factor */
870:   PetscCall(PetscNew(&upTriFactorT));
871:   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

873:   /* set the matrix descriptors of the upper triangular factor */
874:   matrixType = hipsparseGetMatType(upTriFactor->descr);
875:   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
876:   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
877:   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);

879:   /* Create the matrix description */
880:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
881:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
882:   PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
883:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
884:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));

886:   /* set the operation */
887:   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

889:   /* allocate GPU space for the CSC of the upper triangular factor*/
890:   upTriFactorT->csrMat                 = new CsrMatrix;
891:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
892:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
893:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
894:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
895:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
896:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

898:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
899:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
900: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
901:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
902:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
903:                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
904:   PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
905: #endif
906: */
907:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
908:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
909:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
910: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
911:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
912:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
913: #else
914:                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
915: #endif

917:   PetscCallHIP(WaitForHIP());
918:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

920:   /* Create the solve analysis information */
921:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
922:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
923:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
924:                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
925:   PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));

927:   /* perform the solve analysis */
928:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
929:                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

931:   PetscCallHIP(WaitForHIP());
932:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

934:   /* assign the pointer */
935:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
936:   PetscFunctionReturn(PETSC_SUCCESS);
937: }

939: struct PetscScalarToPetscInt {
940:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
941: };

943: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
944: {
945:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
946:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
947:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
948:   hipsparseIndexBase_t           indexBase;

950:   PetscFunctionBegin;
951:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
952:   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
953:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
954:   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
955:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
956:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
957:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
958:   PetscCall(PetscLogGpuTimeBegin());
959:   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
960:   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
961:     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
962:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
963:     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
964:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
965:     PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

967:     /* set alpha and beta */
968:     PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
969:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
970:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
971:     PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
972:     PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
973:     PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

975:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
976:       CsrMatrix *matrixT      = new CsrMatrix;
977:       matstructT->mat         = matrixT;
978:       matrixT->num_rows       = A->cmap->n;
979:       matrixT->num_cols       = A->rmap->n;
980:       matrixT->num_entries    = a->nz;
981:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
982:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
983:       matrixT->values         = new THRUSTARRAY(a->nz);

985:       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
986:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

988:       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
989:                                             indexBase, hipsparse_scalartype));
990:     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
991:       CsrMatrix *temp  = new CsrMatrix;
992:       CsrMatrix *tempT = new CsrMatrix;
993:       /* First convert HYB to CSR */
994:       temp->num_rows       = A->rmap->n;
995:       temp->num_cols       = A->cmap->n;
996:       temp->num_entries    = a->nz;
997:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
998:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
999:       temp->values         = new THRUSTARRAY(a->nz);

1001:       PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

1003:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1004:       tempT->num_rows       = A->rmap->n;
1005:       tempT->num_cols       = A->cmap->n;
1006:       tempT->num_entries    = a->nz;
1007:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1008:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1009:       tempT->values         = new THRUSTARRAY(a->nz);

1011:       PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1012:                                            tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

1014:       /* Last, convert CSC to HYB */
1015:       hipsparseHybMat_t hybMat;
1016:       PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1017:       hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1018:       PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

1020:       /* assign the pointer */
1021:       matstructT->mat = hybMat;
1022:       A->transupdated = PETSC_TRUE;
1023:       /* delete temporaries */
1024:       if (tempT) {
1025:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1026:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1027:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1028:         delete (CsrMatrix *)tempT;
1029:       }
1030:       if (temp) {
1031:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1032:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1033:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1034:         delete (CsrMatrix *)temp;
1035:       }
1036:     }
1037:   }
1038:   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1039:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1040:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1041:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1042:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1043:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1044:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1045:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1046:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1047:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1048:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1049:     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1050:       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1051:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1052:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1053:     }
1054:     if (!hipsparsestruct->csr2csc_i) {
1055:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1056:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1058:       indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1059:       if (matrix->num_entries) {
1060:         /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
1061:            Need to verify this for ROCm.
1062:         */
1063:         PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1064:                                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1065:       } else {
1066:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1067:       }

1069:       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1070:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1071:     }
1072:     PetscCallThrust(
1073:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1074:   }
1075:   PetscCall(PetscLogGpuTimeEnd());
1076:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1077:   /* the compressed row indices is not used for matTranspose */
1078:   matstructT->cprowIndices = NULL;
1079:   /* assign the pointer */
1080:   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1081:   A->transupdated                                 = PETSC_TRUE;
1082:   PetscFunctionReturn(PETSC_SUCCESS);
1083: }

1085: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1086: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1087: {
1088:   PetscInt                              n = xx->map->n;
1089:   const PetscScalar                    *barray;
1090:   PetscScalar                          *xarray;
1091:   thrust::device_ptr<const PetscScalar> bGPU;
1092:   thrust::device_ptr<PetscScalar>       xGPU;
1093:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1094:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1095:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1096:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1098:   PetscFunctionBegin;
1099:   /* Analyze the matrix and create the transpose ... on the fly */
1100:   if (!loTriFactorT && !upTriFactorT) {
1101:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1102:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1103:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1104:   }

1106:   /* Get the GPU pointers */
1107:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1108:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1109:   xGPU = thrust::device_pointer_cast(xarray);
1110:   bGPU = thrust::device_pointer_cast(barray);

1112:   PetscCall(PetscLogGpuTimeBegin());
1113:   /* First, reorder with the row permutation */
1114:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

1116:   /* First, solve U */
1117:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1118:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1120:   /* Then, solve L */
1121:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1122:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1124:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1125:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

1127:   /* Copy the temporary to the full solution. */
1128:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

1130:   /* restore */
1131:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1132:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1133:   PetscCall(PetscLogGpuTimeEnd());
1134:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1135:   PetscFunctionReturn(PETSC_SUCCESS);
1136: }

1138: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1139: {
1140:   const PetscScalar                  *barray;
1141:   PetscScalar                        *xarray;
1142:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1143:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1144:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1145:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1147:   PetscFunctionBegin;
1148:   /* Analyze the matrix and create the transpose ... on the fly */
1149:   if (!loTriFactorT && !upTriFactorT) {
1150:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1151:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1152:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1153:   }

1155:   /* Get the GPU pointers */
1156:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1157:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1159:   PetscCall(PetscLogGpuTimeBegin());
1160:   /* First, solve U */
1161:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1162:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1164:   /* Then, solve L */
1165:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1166:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1168:   /* restore */
1169:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1170:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1171:   PetscCall(PetscLogGpuTimeEnd());
1172:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1173:   PetscFunctionReturn(PETSC_SUCCESS);
1174: }

1176: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1177: {
1178:   const PetscScalar                    *barray;
1179:   PetscScalar                          *xarray;
1180:   thrust::device_ptr<const PetscScalar> bGPU;
1181:   thrust::device_ptr<PetscScalar>       xGPU;
1182:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1183:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1184:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1185:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1187:   PetscFunctionBegin;
1188:   /* Get the GPU pointers */
1189:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1190:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1191:   xGPU = thrust::device_pointer_cast(xarray);
1192:   bGPU = thrust::device_pointer_cast(barray);

1194:   PetscCall(PetscLogGpuTimeBegin());
1195:   /* First, reorder with the row permutation */
1196:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

1198:   /* Next, solve L */
1199:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1200:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1202:   /* Then, solve U */
1203:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1204:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1206:   /* Last, reorder with the column permutation */
1207:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

1209:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1210:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1211:   PetscCall(PetscLogGpuTimeEnd());
1212:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1213:   PetscFunctionReturn(PETSC_SUCCESS);
1214: }

1216: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1217: {
1218:   const PetscScalar                  *barray;
1219:   PetscScalar                        *xarray;
1220:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1221:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1222:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1223:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1225:   PetscFunctionBegin;
1226:   /* Get the GPU pointers */
1227:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1228:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1230:   PetscCall(PetscLogGpuTimeBegin());
1231:   /* First, solve L */
1232:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1233:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1235:   /* Next, solve U */
1236:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1237:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1239:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1240:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1241:   PetscCall(PetscLogGpuTimeEnd());
1242:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1243:   PetscFunctionReturn(PETSC_SUCCESS);
1244: }

1246: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1247: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1248: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1249: {
1250:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1251:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1252:   const PetscScalar             *barray;
1253:   PetscScalar                   *xarray;

1255:   PetscFunctionBegin;
1256:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1257:   PetscCall(VecHIPGetArrayRead(b, &barray));
1258:   PetscCall(PetscLogGpuTimeBegin());

1260:   /* Solve L*y = b */
1261:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1262:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1263:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1264:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                   /* L Y = X */
1265:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1266:   #else
1267:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
1268:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1269:   #endif
1270:   /* Solve U*x = y */
1271:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1272:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1273:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1274:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1275:   #else
1276:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1277:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1278:   #endif
1279:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1280:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1282:   PetscCall(PetscLogGpuTimeEnd());
1283:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1284:   PetscFunctionReturn(PETSC_SUCCESS);
1285: }

1287: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1288: {
1289:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1290:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1291:   const PetscScalar             *barray;
1292:   PetscScalar                   *xarray;

1294:   PetscFunctionBegin;
1295:   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1296:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1297:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1298:                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1300:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1301:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1302:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1303:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1304:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1305:   }

1307:   if (!fs->updatedTransposeSpSVAnalysis) {
1308:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1310:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1311:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1312:   }

1314:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1315:   PetscCall(VecHIPGetArrayRead(b, &barray));
1316:   PetscCall(PetscLogGpuTimeBegin());

1318:   /* Solve Ut*y = b */
1319:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1320:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1321:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1322:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1323:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1324:   #else
1325:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1326:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1327:   #endif
1328:   /* Solve Lt*x = y */
1329:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1330:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1331:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1332:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1333:   #else
1334:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1335:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1336:   #endif
1337:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1338:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1339:   PetscCall(PetscLogGpuTimeEnd());
1340:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1341:   PetscFunctionReturn(PETSC_SUCCESS);
1342: }

1344: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1345: {
1346:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1347:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1348:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1349:   CsrMatrix                     *Acsr;
1350:   PetscInt                       m, nz;
1351:   PetscBool                      flg;

1353:   PetscFunctionBegin;
1354:   if (PetscDefined(USE_DEBUG)) {
1355:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1356:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1357:   }

1359:   /* Copy A's value to fact */
1360:   m  = fact->rmap->n;
1361:   nz = aij->nz;
1362:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1363:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1364:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1366:   /* Factorize fact inplace */
1367:   if (m)
1368:     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1369:                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1370:   if (PetscDefined(USE_DEBUG)) {
1371:     int               numerical_zero;
1372:     hipsparseStatus_t status;
1373:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1374:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1375:   }

1377:   /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1378:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1380:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1382:   /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1383:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1385:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1386:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
1387:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1388:   fact->ops->matsolve          = NULL;
1389:   fact->ops->matsolvetranspose = NULL;
1390:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1391:   PetscFunctionReturn(PETSC_SUCCESS);
1392: }

1394: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1395: {
1396:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1397:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1398:   PetscInt                       m, nz;

1400:   PetscFunctionBegin;
1401:   if (PetscDefined(USE_DEBUG)) {
1402:     PetscBool flg, diagDense;

1404:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1405:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1406:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1407:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1408:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1409:   }

1411:   /* Free the old stale stuff */
1412:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1414:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1415:      but they will not be used. Allocate them just for easy debugging.
1416:    */
1417:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1419:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1420:   fact->factortype             = MAT_FACTOR_ILU;
1421:   fact->info.factor_mallocs    = 0;
1422:   fact->info.fill_ratio_given  = info->fill;
1423:   fact->info.fill_ratio_needed = 1.0;

1425:   aij->row = NULL;
1426:   aij->col = NULL;

1428:   /* ====================================================================== */
1429:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1430:   /* We'll do in-place factorization on fact                                */
1431:   /* ====================================================================== */
1432:   const int *Ai, *Aj;

1434:   m  = fact->rmap->n;
1435:   nz = aij->nz;

1437:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1438:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1439:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1440:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1441:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1444:   /* ====================================================================== */
1445:   /* Create descriptors for M, L, U                                         */
1446:   /* ====================================================================== */
1447:   hipsparseFillMode_t fillMode;
1448:   hipsparseDiagType_t diagType;

1450:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1451:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1452:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1454:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1455:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1456:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1457:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1458:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1459:   */
1460:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1461:   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1462:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1463:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1464:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1466:   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1467:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1468:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1469:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1470:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1472:   /* ========================================================================= */
1473:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1474:   /* ========================================================================= */
1475:   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1476:   if (m)
1477:     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1478:                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1480:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1481:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1483:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1484:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1486:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1487:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1489:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1490:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1492:   /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1493:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1494:    */
1495:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1496:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1497:     fs->spsvBuffer_L = fs->factBuffer_M;
1498:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1499:   } else {
1500:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1501:     fs->spsvBuffer_U = fs->factBuffer_M;
1502:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1503:   }

1505:   /* ========================================================================== */
1506:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1507:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1508:   /* ========================================================================== */
1509:   int structural_zero;

1511:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1512:   if (m)
1513:     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1514:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1515:   if (PetscDefined(USE_DEBUG)) {
1516:     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1517:     hipsparseStatus_t status;
1518:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1519:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1520:   }

1522:   /* Estimate FLOPs of the numeric factorization */
1523:   {
1524:     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
1525:     PetscInt       *Ai, nzRow, nzLeft;
1526:     PetscLogDouble  flops = 0.0;
1527:     const PetscInt *Adiag;

1529:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
1530:     Ai = Aseq->i;
1531:     for (PetscInt i = 0; i < m; i++) {
1532:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1533:         nzRow  = Ai[i + 1] - Ai[i];
1534:         nzLeft = Adiag[i] - Ai[i];
1535:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1536:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1537:         */
1538:         nzLeft = (nzRow - 1) / 2;
1539:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1540:       }
1541:     }
1542:     fs->numericFactFlops = flops;
1543:   }
1544:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1545:   PetscFunctionReturn(PETSC_SUCCESS);
1546: }

1548: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1549: {
1550:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1551:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1552:   const PetscScalar             *barray;
1553:   PetscScalar                   *xarray;

1555:   PetscFunctionBegin;
1556:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1557:   PetscCall(VecHIPGetArrayRead(b, &barray));
1558:   PetscCall(PetscLogGpuTimeBegin());

1560:   /* Solve L*y = b */
1561:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1562:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1563:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1564:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1565:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1566:   #else
1567:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1568:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1569:   #endif
1570:   /* Solve Lt*x = y */
1571:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1572:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1573:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1574:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1575:   #else
1576:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1577:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1578:   #endif
1579:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1580:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1582:   PetscCall(PetscLogGpuTimeEnd());
1583:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1584:   PetscFunctionReturn(PETSC_SUCCESS);
1585: }

1587: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1588: {
1589:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1590:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1591:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1592:   CsrMatrix                     *Acsr;
1593:   PetscInt                       m, nz;
1594:   PetscBool                      flg;

1596:   PetscFunctionBegin;
1597:   if (PetscDefined(USE_DEBUG)) {
1598:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1599:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1600:   }

1602:   /* Copy A's value to fact */
1603:   m  = fact->rmap->n;
1604:   nz = aij->nz;
1605:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1606:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1607:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1609:   /* Factorize fact inplace */
1610:   /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1611:      The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1612:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1613:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1614:    */
1615:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1616:   if (PetscDefined(USE_DEBUG)) {
1617:     int               numerical_zero;
1618:     hipsparseStatus_t status;
1619:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1620:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1621:   }

1623:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1625:   /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1626:     ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1627:   */
1628:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1630:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1631:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1632:   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1633:   fact->ops->matsolve          = NULL;
1634:   fact->ops->matsolvetranspose = NULL;
1635:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1636:   PetscFunctionReturn(PETSC_SUCCESS);
1637: }

1639: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1640: {
1641:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1642:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1643:   PetscInt                       m, nz;

1645:   PetscFunctionBegin;
1646:   if (PetscDefined(USE_DEBUG)) {
1647:     PetscBool flg, diagDense;

1649:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1650:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1651:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1652:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1653:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1654:   }

1656:   /* Free the old stale stuff */
1657:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1659:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1660:      but they will not be used. Allocate them just for easy debugging.
1661:    */
1662:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1664:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1665:   fact->factortype             = MAT_FACTOR_ICC;
1666:   fact->info.factor_mallocs    = 0;
1667:   fact->info.fill_ratio_given  = info->fill;
1668:   fact->info.fill_ratio_needed = 1.0;

1670:   aij->row = NULL;
1671:   aij->col = NULL;

1673:   /* ====================================================================== */
1674:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1675:   /* We'll do in-place factorization on fact                                */
1676:   /* ====================================================================== */
1677:   const int *Ai, *Aj;

1679:   m  = fact->rmap->n;
1680:   nz = aij->nz;

1682:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1683:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1684:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1685:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1686:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1687:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1689:   /* ====================================================================== */
1690:   /* Create mat descriptors for M, L                                        */
1691:   /* ====================================================================== */
1692:   hipsparseFillMode_t fillMode;
1693:   hipsparseDiagType_t diagType;

1695:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1696:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1697:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1699:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1700:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1701:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1702:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1703:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1704:   */
1705:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1706:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1707:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1708:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1709:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1711:   /* ========================================================================= */
1712:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1713:   /* ========================================================================= */
1714:   PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1715:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

1717:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1718:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1720:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1721:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1723:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1724:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1726:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1727:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1729:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1730:      See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1731:    */
1732:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1733:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1734:     fs->spsvBuffer_L = fs->factBuffer_M;
1735:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1736:   } else {
1737:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1738:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1739:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1740:   }

1742:   /* ========================================================================== */
1743:   /* Perform analysis of ic0 on M                                               */
1744:   /* The lower triangular part of M has the same sparsity pattern as L          */
1745:   /* ========================================================================== */
1746:   int structural_zero;

1748:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1749:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1750:   if (PetscDefined(USE_DEBUG)) {
1751:     hipsparseStatus_t status;
1752:     /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1753:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1754:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1755:   }

1757:   /* Estimate FLOPs of the numeric factorization */
1758:   {
1759:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1760:     PetscInt      *Ai, nzRow, nzLeft;
1761:     PetscLogDouble flops = 0.0;

1763:     Ai = Aseq->i;
1764:     for (PetscInt i = 0; i < m; i++) {
1765:       nzRow = Ai[i + 1] - Ai[i];
1766:       if (nzRow > 1) {
1767:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1768:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1769:         */
1770:         nzLeft = (nzRow - 1) / 2;
1771:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1772:       }
1773:     }
1774:     fs->numericFactFlops = flops;
1775:   }
1776:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1777:   PetscFunctionReturn(PETSC_SUCCESS);
1778: }
1779: #endif

1781: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1782: {
1783:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1785:   PetscFunctionBegin;
1786: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1787:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1788:   if (!info->factoronhost) {
1789:     PetscCall(ISIdentity(isrow, &row_identity));
1790:     PetscCall(ISIdentity(iscol, &col_identity));
1791:   }
1792:   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1793:   else
1794: #endif
1795:   {
1796:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1797:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1798:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1799:   }
1800:   PetscFunctionReturn(PETSC_SUCCESS);
1801: }

1803: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1804: {
1805:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1807:   PetscFunctionBegin;
1808:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1809:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1810:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1811:   PetscFunctionReturn(PETSC_SUCCESS);
1812: }

1814: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1815: {
1816:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1818:   PetscFunctionBegin;
1819: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1820:   PetscBool perm_identity = PETSC_FALSE;
1821:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1822:   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1823:   else
1824: #endif
1825:   {
1826:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1827:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1828:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1829:   }
1830:   PetscFunctionReturn(PETSC_SUCCESS);
1831: }

1833: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1834: {
1835:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1837:   PetscFunctionBegin;
1838:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1839:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1840:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1841:   PetscFunctionReturn(PETSC_SUCCESS);
1842: }

1844: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1845: {
1846:   PetscFunctionBegin;
1847:   *type = MATSOLVERHIPSPARSE;
1848:   PetscFunctionReturn(PETSC_SUCCESS);
1849: }

1851: /*MC
  MATSOLVERHIPSPARSE = "hipsparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJHIPSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  hipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

1859:   Level: beginner

1861: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1862: M*/

1864: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1865: {
1866:   PetscInt n = A->rmap->n;

1868:   PetscFunctionBegin;
1869:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1870:   PetscCall(MatSetSizes(*B, n, n, n, n));
1871:   (*B)->factortype = ftype;
1872:   PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

1874:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1875:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1876:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1877:     if (!A->boundtocpu) {
1878:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1879:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1880:     } else {
1881:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1882:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1883:     }
1884:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1885:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1886:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1887:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1888:     if (!A->boundtocpu) {
1889:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1890:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1891:     } else {
1892:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1893:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1894:     }
1895:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1896:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1897:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

1899:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1900:   (*B)->canuseordering = PETSC_TRUE;
1901:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1902:   PetscFunctionReturn(PETSC_SUCCESS);
1903: }

1905: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1906: {
1907:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
1908:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1909: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1910:   Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1911: #endif

1913:   PetscFunctionBegin;
1914:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1915:     PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1916:     if (A->factortype == MAT_FACTOR_NONE) {
1917:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1918:       PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1919:     }
1920: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1921:     else if (fs->csrVal) {
1922:       /* We have a factorized matrix on device and are able to copy it to host */
1923:       PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1924:     }
1925: #endif
1926:     else
1927:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1928:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1929:     PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1930:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1931:   }
1932:   PetscFunctionReturn(PETSC_SUCCESS);
1933: }

1935: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (HIP specialisation) */
1936: struct MatSeqAIJHIPSPARSE_Policy {
1937:   typedef Mat_SeqAIJHIPSPARSE           mat_struct_type;
1938:   typedef Mat_SeqAIJHIPSPARSEMultStruct mult_struct_type;

1940:   static int storage_format_csr() { return (int)MAT_HIPSPARSE_CSR; }
1941:   static int storage_format_ell() { return (int)MAT_HIPSPARSE_ELL; }
1942:   static int storage_format_hyb() { return (int)MAT_HIPSPARSE_HYB; }

1944:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJHIPSPARSECopyToGPU(A); }
1945:   static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJHIPSPARSECopyFromGPU(A); }
1946:   static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJHIPSPARSEInvalidateTranspose(A, d); }
1947:   static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, t, r, C); }
1948:   static const char    *mat_type_name;

1950:   static PetscErrorCode Destroy(Mat A) { return MatSeqAIJHIPSPARSE_Destroy(A); }
1951:   static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)spptr); }
1952:   static const char    *set_format_c;
1953:   static const char    *set_use_cpu_solve_c;
1954:   static const char    *product_seqdense_device_c;
1955:   static const char    *product_seqdense_c;
1956:   static const char    *product_self_c;
1957:   static const char    *seq_convert_hypre_c;

1959:   static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecHIPGetArrayRead(v, a); }
1960:   static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecHIPRestoreArrayRead(v, a); }
1961:   static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecHIPGetArrayWrite(v, a); }
1962:   static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecHIPRestoreArrayWrite(v, a); }
1963: };
1964: const char *MatSeqAIJHIPSPARSE_Policy::mat_type_name             = MATSEQAIJHIPSPARSE;
1965: const char *MatSeqAIJHIPSPARSE_Policy::set_format_c              = "MatHIPSPARSESetFormat_C";
1966: const char *MatSeqAIJHIPSPARSE_Policy::set_use_cpu_solve_c       = "MatHIPSPARSESetUseCPUSolve_C";
1967: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C";
1968: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_c        = "MatProductSetFromOptions_seqaijhipsparse_seqdense_C";
1969: const char *MatSeqAIJHIPSPARSE_Policy::product_self_c            = "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C";
1970: const char *MatSeqAIJHIPSPARSE_Policy::seq_convert_hypre_c       = "MatConvert_seqaijhipsparse_hypre_C";

1972: using MatSeqAIJHIPSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::HIP, MatSeqAIJHIPSPARSE_Policy>;

1974: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1975: {
1976:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1977: }

1979: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1980: {
1981:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1982: }

1984: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1985: {
1986:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1987: }

1989: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1990: {
1991:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1992: }

1994: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1995: {
1996:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1997: }

1999: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
2000: {
2001:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
2002: }

2004: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2005: {
2006:   Mat_SeqAIJHIPSPARSE *cusp;
2007:   CsrMatrix           *matrix;

2009:   PetscFunctionBegin;
2010:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2011:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2012:   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
2013:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2014:   matrix = (CsrMatrix *)cusp->mat->mat;

2016:   if (i) {
2017: #if !defined(PETSC_USE_64BIT_INDICES)
2018:     *i = matrix->row_offsets->data().get();
2019: #else
2020:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2021: #endif
2022:   }
2023:   if (j) {
2024: #if !defined(PETSC_USE_64BIT_INDICES)
2025:     *j = matrix->column_indices->data().get();
2026: #else
2027:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2028: #endif
2029:   }
2030:   if (a) *a = matrix->values->data().get();
2031:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2032:   PetscFunctionReturn(PETSC_SUCCESS);
2033: }

2035: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2036: {
2037:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2038:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
2039:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2040:   PetscBool                      both            = PETSC_TRUE;
2041:   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;

2043:   PetscFunctionBegin;
2044:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2045:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2046:     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2047:       CsrMatrix *matrix;
2048:       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

2050:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2051:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2052:       matrix->values->assign(a->a, a->a + a->nz);
2053:       PetscCallHIP(WaitForHIP());
2054:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2055:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2056:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2057:     } else {
2058:       PetscInt nnz;
2059:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2060:       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2061:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2062:       delete hipsparsestruct->workVector;
2063:       delete hipsparsestruct->rowoffsets_gpu;
2064:       hipsparsestruct->workVector     = NULL;
2065:       hipsparsestruct->rowoffsets_gpu = NULL;
2066:       try {
2067:         if (a->compressedrow.use) {
2068:           m    = a->compressedrow.nrows;
2069:           ii   = a->compressedrow.i;
2070:           ridx = a->compressedrow.rindex;
2071:         } else {
2072:           m    = A->rmap->n;
2073:           ii   = a->i;
2074:           ridx = NULL;
2075:         }
2076:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2077:         if (!a->a) {
2078:           nnz  = ii[m];
2079:           both = PETSC_FALSE;
2080:         } else nnz = a->nz;
2081:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2083:         /* create hipsparse matrix */
2084:         hipsparsestruct->nrows = m;
2085:         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2086:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2087:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2088:         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

2090:         PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2091:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2092:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2093:         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2094:         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2095:         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2096:         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

2098:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2099:         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2100:           /* set the matrix */
2101:           CsrMatrix *mat      = new CsrMatrix;
2102:           mat->num_rows       = m;
2103:           mat->num_cols       = A->cmap->n;
2104:           mat->num_entries    = nnz;
2105:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2106:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2107:           mat->values         = new THRUSTARRAY(nnz);
2108:           mat->row_offsets->assign(ii, ii + m + 1);
2109:           mat->column_indices->assign(a->j, a->j + nnz);
2110:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2112:           /* assign the pointer */
2113:           matstruct->mat = mat;
2114:           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2115:             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2116:                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2117:           }
2118:         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2119:           CsrMatrix *mat      = new CsrMatrix;
2120:           mat->num_rows       = m;
2121:           mat->num_cols       = A->cmap->n;
2122:           mat->num_entries    = nnz;
2123:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2124:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2125:           mat->values         = new THRUSTARRAY(nnz);
2126:           mat->row_offsets->assign(ii, ii + m + 1);
2127:           mat->column_indices->assign(a->j, a->j + nnz);
2128:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2130:           hipsparseHybMat_t hybMat;
2131:           PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2132:           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2133:           PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2134:           /* assign the pointer */
2135:           matstruct->mat = hybMat;

2137:           if (mat) {
2138:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2139:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2140:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2141:             delete (CsrMatrix *)mat;
2142:           }
2143:         }

2145:         /* assign the compressed row indices */
2146:         if (a->compressedrow.use) {
2147:           hipsparsestruct->workVector = new THRUSTARRAY(m);
2148:           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2149:           matstruct->cprowIndices->assign(ridx, ridx + m);
2150:           tmp = m;
2151:         } else {
2152:           hipsparsestruct->workVector = NULL;
2153:           matstruct->cprowIndices     = NULL;
2154:           tmp                         = 0;
2155:         }
2156:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2158:         /* assign the pointer */
2159:         hipsparsestruct->mat = matstruct;
2160:       } catch (char *ex) {
2161:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2162:       }
2163:       PetscCallHIP(WaitForHIP());
2164:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2165:       hipsparsestruct->nonzerostate = A->nonzerostate;
2166:     }
2167:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2168:   }
2169:   PetscFunctionReturn(PETSC_SUCCESS);
2170: }

2172: struct VecHIPPlusEquals {
2173:   template <typename Tuple>
2174:   __host__ __device__ void operator()(Tuple t)
2175:   {
2176:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2177:   }
2178: };

2180: struct VecHIPEquals {
2181:   template <typename Tuple>
2182:   __host__ __device__ void operator()(Tuple t)
2183:   {
2184:     thrust::get<1>(t) = thrust::get<0>(t);
2185:   }
2186: };

2188: struct VecHIPEqualsReverse {
2189:   template <typename Tuple>
2190:   __host__ __device__ void operator()(Tuple t)
2191:   {
2192:     thrust::get<0>(t) = thrust::get<1>(t);
2193:   }
2194: };

2196: struct MatProductCtx_MatMatHipsparse {
2197:   PetscBool             cisdense;
2198:   PetscScalar          *Bt;
2199:   Mat                   X;
2200:   PetscBool             reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2201:   PetscLogDouble        flops;
2202:   CsrMatrix            *Bcsr;
2203:   hipsparseSpMatDescr_t matSpBDescr;
2204:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
2205:   hipsparseDnMatDescr_t matBDescr;
2206:   hipsparseDnMatDescr_t matCDescr;
2207:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2208: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2209:   void *dBuffer4, *dBuffer5;
2210: #endif
2211:   size_t                 mmBufferSize;
2212:   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2213:   hipsparseSpGEMMDescr_t spgemmDesc;
2214: };

2216: static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2217: {
2218:   MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;

2220:   PetscFunctionBegin;
2221:   PetscCallHIP(hipFree(mmdata->Bt));
2222:   delete mmdata->Bcsr;
2223:   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2224:   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2225:   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2226:   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2227: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2228:   PetscCallHIP(hipFree(mmdata->dBuffer4));
2229:   PetscCallHIP(hipFree(mmdata->dBuffer5));
2230: #endif
2231:   PetscCallHIP(hipFree(mmdata->mmBuffer));
2232:   PetscCallHIP(hipFree(mmdata->mmBuffer2));
2233:   PetscCall(MatDestroy(&mmdata->X));
2234:   PetscCall(PetscFree(*(void **)data));
2235:   PetscFunctionReturn(PETSC_SUCCESS);
2236: }

2238: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2239: {
2240:   Mat_Product                   *product = C->product;
2241:   Mat                            A, B;
2242:   PetscInt                       m, n, blda, clda;
2243:   PetscBool                      flg, biship;
2244:   Mat_SeqAIJHIPSPARSE           *cusp;
2245:   hipsparseOperation_t           opA;
2246:   const PetscScalar             *barray;
2247:   PetscScalar                   *carray;
2248:   MatProductCtx_MatMatHipsparse *mmdata;
2249:   Mat_SeqAIJHIPSPARSEMultStruct *mat;
2250:   CsrMatrix                     *csrmat;

2252:   PetscFunctionBegin;
2253:   MatCheckProduct(C, 1);
2254:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2255:   mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
2256:   A      = product->A;
2257:   B      = product->B;
2258:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2259:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2260:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2261:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2262:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2263:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2264:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2265:   switch (product->type) {
2266:   case MATPRODUCT_AB:
2267:   case MATPRODUCT_PtAP:
2268:     mat = cusp->mat;
2269:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2270:     m   = A->rmap->n;
2271:     n   = B->cmap->n;
2272:     break;
2273:   case MATPRODUCT_AtB:
2274:     if (!A->form_explicit_transpose) {
2275:       mat = cusp->mat;
2276:       opA = HIPSPARSE_OPERATION_TRANSPOSE;
2277:     } else {
2278:       PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2279:       mat = cusp->matTranspose;
2280:       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2281:     }
2282:     m = A->cmap->n;
2283:     n = B->cmap->n;
2284:     break;
2285:   case MATPRODUCT_ABt:
2286:   case MATPRODUCT_RARt:
2287:     mat = cusp->mat;
2288:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2289:     m   = A->rmap->n;
2290:     n   = B->rmap->n;
2291:     break;
2292:   default:
2293:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2294:   }
2295:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2296:   csrmat = (CsrMatrix *)mat->mat;
2297:   /* if the user passed a CPU matrix, copy the data to the GPU */
2298:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2299:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
2300:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2301:   PetscCall(MatDenseGetLDA(B, &blda));
2302:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2303:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2304:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2305:   } else {
2306:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2307:     PetscCall(MatDenseGetLDA(C, &clda));
2308:   }

2310:   PetscCall(PetscLogGpuTimeBegin());
2311:   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2312:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2313:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2314:     size_t mmBufferSize;
2315:     if (mmdata->initialized && mmdata->Blda != blda) {
2316:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2317:       mmdata->matBDescr = NULL;
2318:     }
2319:     if (!mmdata->matBDescr) {
2320:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2321:       mmdata->Blda = blda;
2322:     }
2323:     if (mmdata->initialized && mmdata->Clda != clda) {
2324:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2325:       mmdata->matCDescr = NULL;
2326:     }
2327:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2328:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2329:       mmdata->Clda = clda;
2330:     }
2331:     if (!mat->matDescr) {
2332:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2333:                                             HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2334:     }
2335:     PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2336:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2337:       PetscCallHIP(hipFree(mmdata->mmBuffer));
2338:       PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2339:       mmdata->mmBufferSize = mmBufferSize;
2340:     }
2341:     mmdata->initialized = PETSC_TRUE;
2342:   } else {
2343:     /* to be safe, always update pointers of the mats */
2344:     PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2345:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2346:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2347:   }

2349:   /* do hipsparseSpMM, which supports transpose on B */
2350:   PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

2352:   PetscCall(PetscLogGpuTimeEnd());
2353:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2354:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2355:   if (product->type == MATPRODUCT_RARt) {
2356:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2357:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2358:   } else if (product->type == MATPRODUCT_PtAP) {
2359:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2360:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2361:   } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2362:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2363:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2364:   PetscFunctionReturn(PETSC_SUCCESS);
2365: }

2367: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2368: {
2369:   Mat_Product                   *product = C->product;
2370:   Mat                            A, B;
2371:   PetscInt                       m, n;
2372:   PetscBool                      cisdense, flg;
2373:   MatProductCtx_MatMatHipsparse *mmdata;
2374:   Mat_SeqAIJHIPSPARSE           *cusp;

2376:   PetscFunctionBegin;
2377:   MatCheckProduct(C, 1);
2378:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2379:   A = product->A;
2380:   B = product->B;
2381:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2382:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2383:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2384:   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2385:   switch (product->type) {
2386:   case MATPRODUCT_AB:
2387:     m = A->rmap->n;
2388:     n = B->cmap->n;
2389:     break;
2390:   case MATPRODUCT_AtB:
2391:     m = A->cmap->n;
2392:     n = B->cmap->n;
2393:     break;
2394:   case MATPRODUCT_ABt:
2395:     m = A->rmap->n;
2396:     n = B->rmap->n;
2397:     break;
2398:   case MATPRODUCT_PtAP:
2399:     m = B->cmap->n;
2400:     n = B->cmap->n;
2401:     break;
2402:   case MATPRODUCT_RARt:
2403:     m = B->rmap->n;
2404:     n = B->rmap->n;
2405:     break;
2406:   default:
2407:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2408:   }
2409:   PetscCall(MatSetSizes(C, m, n, m, n));
2410:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2411:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2412:   PetscCall(MatSetType(C, MATSEQDENSEHIP));

2414:   /* product data */
2415:   PetscCall(PetscNew(&mmdata));
2416:   mmdata->cisdense = cisdense;
2417:   /* for these products we need intermediate storage */
2418:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2419:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2420:     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2421:     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2422:     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2423:     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2424:   }
2425:   C->product->data       = mmdata;
2426:   C->product->destroy    = MatProductCtxDestroy_MatMatHipsparse;
2427:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2428:   PetscFunctionReturn(PETSC_SUCCESS);
2429: }

2431: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2432: {
2433:   Mat_Product                   *product = C->product;
2434:   Mat                            A, B;
2435:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2436:   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
2437:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2438:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2439:   PetscBool                      flg;
2440:   MatProductType                 ptype;
2441:   MatProductCtx_MatMatHipsparse *mmdata;
2442:   hipsparseSpMatDescr_t          BmatSpDescr;
2443:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2445:   PetscFunctionBegin;
2446:   MatCheckProduct(C, 1);
2447:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2448:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2449:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2450:   mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
2451:   A      = product->A;
2452:   B      = product->B;
2453:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2454:     mmdata->reusesym = PETSC_FALSE;
2455:     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2456:     PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2457:     Cmat = Ccusp->mat;
2458:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2459:     Ccsr = (CsrMatrix *)Cmat->mat;
2460:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2461:     goto finalize;
2462:   }
2463:   if (!c->nz) goto finalize;
2464:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2465:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2466:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2467:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2468:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2469:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2470:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2471:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2472:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2473:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2474:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2475:   PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2476:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2477:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

2479:   ptype = product->type;
2480:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2481:     ptype = MATPRODUCT_AB;
2482:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2483:   }
2484:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2485:     ptype = MATPRODUCT_AB;
2486:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2487:   }
2488:   switch (ptype) {
2489:   case MATPRODUCT_AB:
2490:     Amat = Acusp->mat;
2491:     Bmat = Bcusp->mat;
2492:     break;
2493:   case MATPRODUCT_AtB:
2494:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2495:     Amat = Acusp->matTranspose;
2496:     Bmat = Bcusp->mat;
2497:     break;
2498:   case MATPRODUCT_ABt:
2499:     Amat = Acusp->mat;
2500:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2501:     Bmat = Bcusp->matTranspose;
2502:     break;
2503:   default:
2504:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2505:   }
2506:   Cmat = Ccusp->mat;
2507:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2508:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2509:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2510:   Acsr = (CsrMatrix *)Amat->mat;
2511:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2512:   Ccsr = (CsrMatrix *)Cmat->mat;
2513:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2514:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2515:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2516:   PetscCall(PetscLogGpuTimeBegin());
2517: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2518:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2519:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2520:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2521:   PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2522:   #else
2523:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2524:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2525:   #endif
2526: #else
2527:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2528:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2529:                                           Ccsr->column_indices->data().get()));
2530: #endif
2531:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2532:   PetscCallHIP(WaitForHIP());
2533:   PetscCall(PetscLogGpuTimeEnd());
2534:   C->offloadmask = PETSC_OFFLOAD_GPU;
2535: finalize:
2536:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2537:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2538:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2539:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2540:   c->reallocs = 0;
2541:   C->info.mallocs += 0;
2542:   C->info.nz_unneeded = 0;
2543:   C->assembled = C->was_assembled = PETSC_TRUE;
2544:   C->num_ass++;
2545:   PetscFunctionReturn(PETSC_SUCCESS);
2546: }

2548: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2549: {
2550:   Mat_Product                   *product = C->product;
2551:   Mat                            A, B;
2552:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2553:   Mat_SeqAIJ                    *a, *b, *c;
2554:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2555:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2556:   PetscInt                       i, j, m, n, k;
2557:   PetscBool                      flg;
2558:   MatProductType                 ptype;
2559:   MatProductCtx_MatMatHipsparse *mmdata;
2560:   PetscLogDouble                 flops;
2561:   PetscBool                      biscompressed, ciscompressed;
2562: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2563:   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2564:   hipsparseSpMatDescr_t BmatSpDescr;
2565: #else
2566:   int cnz;
2567: #endif
2568:   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2570:   PetscFunctionBegin;
2571:   MatCheckProduct(C, 1);
2572:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2573:   A = product->A;
2574:   B = product->B;
2575:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2576:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2577:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2578:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2579:   a = (Mat_SeqAIJ *)A->data;
2580:   b = (Mat_SeqAIJ *)B->data;
2581:   /* product data */
2582:   PetscCall(PetscNew(&mmdata));
2583:   C->product->data    = mmdata;
2584:   C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;

2586:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2587:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2588:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2589:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2590:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2591:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

2593:   ptype = product->type;
2594:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2595:     ptype                                          = MATPRODUCT_AB;
2596:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2597:   }
2598:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2599:     ptype                                          = MATPRODUCT_AB;
2600:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2601:   }
2602:   biscompressed = PETSC_FALSE;
2603:   ciscompressed = PETSC_FALSE;
2604:   switch (ptype) {
2605:   case MATPRODUCT_AB:
2606:     m    = A->rmap->n;
2607:     n    = B->cmap->n;
2608:     k    = A->cmap->n;
2609:     Amat = Acusp->mat;
2610:     Bmat = Bcusp->mat;
2611:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2612:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2613:     break;
2614:   case MATPRODUCT_AtB:
2615:     m = A->cmap->n;
2616:     n = B->cmap->n;
2617:     k = A->rmap->n;
2618:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2619:     Amat = Acusp->matTranspose;
2620:     Bmat = Bcusp->mat;
2621:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2622:     break;
2623:   case MATPRODUCT_ABt:
2624:     m = A->rmap->n;
2625:     n = B->rmap->n;
2626:     k = A->cmap->n;
2627:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2628:     Amat = Acusp->mat;
2629:     Bmat = Bcusp->matTranspose;
2630:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2631:     break;
2632:   default:
2633:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2634:   }

2636:   /* create hipsparse matrix */
2637:   PetscCall(MatSetSizes(C, m, n, m, n));
2638:   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2639:   c     = (Mat_SeqAIJ *)C->data;
2640:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2641:   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2642:   Ccsr  = new CsrMatrix;

2644:   c->compressedrow.use = ciscompressed;
2645:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2646:     c->compressedrow.nrows = a->compressedrow.nrows;
2647:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2648:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2649:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2650:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2651:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2652:   } else {
2653:     c->compressedrow.nrows  = 0;
2654:     c->compressedrow.i      = NULL;
2655:     c->compressedrow.rindex = NULL;
2656:     Ccusp->workVector       = NULL;
2657:     Cmat->cprowIndices      = NULL;
2658:   }
2659:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2660:   Ccusp->mat        = Cmat;
2661:   Ccusp->mat->mat   = Ccsr;
2662:   Ccsr->num_rows    = Ccusp->nrows;
2663:   Ccsr->num_cols    = n;
2664:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2665:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2666:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2667:   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2668:   PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2669:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2670:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2671:   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2672:   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2673:   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2674:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2675:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2676:     c->nz                = 0;
2677:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2678:     Ccsr->values         = new THRUSTARRAY(c->nz);
2679:     goto finalizesym;
2680:   }

2682:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2683:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2684:   Acsr = (CsrMatrix *)Amat->mat;
2685:   if (!biscompressed) {
2686:     Bcsr        = (CsrMatrix *)Bmat->mat;
2687:     BmatSpDescr = Bmat->matDescr;
2688:   } else { /* we need to use row offsets for the full matrix */
2689:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2690:     Bcsr                 = new CsrMatrix;
2691:     Bcsr->num_rows       = B->rmap->n;
2692:     Bcsr->num_cols       = cBcsr->num_cols;
2693:     Bcsr->num_entries    = cBcsr->num_entries;
2694:     Bcsr->column_indices = cBcsr->column_indices;
2695:     Bcsr->values         = cBcsr->values;
2696:     if (!Bcusp->rowoffsets_gpu) {
2697:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2698:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2699:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2700:     }
2701:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2702:     mmdata->Bcsr      = Bcsr;
2703:     if (Bcsr->num_rows && Bcsr->num_cols) {
2704:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2705:     }
2706:     BmatSpDescr = mmdata->matSpBDescr;
2707:   }
2708:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2709:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2710:   /* precompute flops count */
2711:   if (ptype == MATPRODUCT_AB) {
2712:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2713:       const PetscInt st = a->i[i];
2714:       const PetscInt en = a->i[i + 1];
2715:       for (j = st; j < en; j++) {
2716:         const PetscInt brow = a->j[j];
2717:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2718:       }
2719:     }
2720:   } else if (ptype == MATPRODUCT_AtB) {
2721:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2722:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2723:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2724:       flops += (2. * anzi) * bnzi;
2725:     }
2726:   } else flops = 0.; /* TODO */

2728:   mmdata->flops = flops;
2729:   PetscCall(PetscLogGpuTimeBegin());
2730: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2731:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2732:   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2733:   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2734:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2735:   {
2736:     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2737:      We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2738:   */
2739:     void *dBuffer1 = NULL;
2740:     void *dBuffer2 = NULL;
2741:     void *dBuffer3 = NULL;
2742:     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2743:     size_t bufferSize1 = 0;
2744:     size_t bufferSize2 = 0;
2745:     size_t bufferSize3 = 0;
2746:     size_t bufferSize4 = 0;
2747:     size_t bufferSize5 = 0;

2749:     /* ask bufferSize1 bytes for external memory */
2750:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2751:     PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2752:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2753:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));

2755:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2756:     PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2757:     PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2758:     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2759:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2760:     PetscCallHIP(hipFree(dBuffer1));
2761:     PetscCallHIP(hipFree(dBuffer2));

2763:     /* get matrix C non-zero entries C_nnz1 */
2764:     PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2765:     c->nz = (PetscInt)C_nnz1;
2766:     /* allocate matrix C */
2767:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2768:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2769:     Ccsr->values = new THRUSTARRAY(c->nz);
2770:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2771:     /* update matC with the new pointers */
2772:     if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2773:       PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));

2775:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2776:       PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2777:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2778:       PetscCallHIP(hipFree(dBuffer3));
2779:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2780:     }
2781:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2782:   }
2783:   #else
2784:   size_t bufSize2;
2785:   /* ask bufferSize bytes for external memory */
2786:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2787:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2788:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2789:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2790:   /* ask bufferSize again bytes for external memory */
2791:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2792:   /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2793:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2794:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2795:      is stored in the descriptor! What a messy API... */
2796:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2797:   /* compute the intermediate product of A * B */
2798:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2799:   /* get matrix C non-zero entries C_nnz1 */
2800:   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2801:   c->nz = (PetscInt)C_nnz1;
2802:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2803:                       mmdata->mmBufferSize / 1024));
2804:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2805:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2806:   Ccsr->values = new THRUSTARRAY(c->nz);
2807:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2808:   PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2809:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2810:   #endif
2811: #else
2812:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2813:   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2814:                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2815:   c->nz                = cnz;
2816:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2817:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2818:   Ccsr->values = new THRUSTARRAY(c->nz);
2819:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */

2821:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2822:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2823:       I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2824:       D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2825:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2826:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2827:                                           Ccsr->column_indices->data().get()));
2828: #endif
2829:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2830:   PetscCall(PetscLogGpuTimeEnd());
2831: finalizesym:
2832:   c->free_a = PETSC_TRUE;
2833:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2834:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2835:   c->free_ij = PETSC_TRUE;
2836:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2837:     PetscInt      *d_i = c->i;
2838:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2839:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2840:     ii = *Ccsr->row_offsets;
2841:     jj = *Ccsr->column_indices;
2842:     if (ciscompressed) d_i = c->compressedrow.i;
2843:     PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2844:     PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2845:   } else {
2846:     PetscInt *d_i = c->i;
2847:     if (ciscompressed) d_i = c->compressedrow.i;
2848:     PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2849:     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2850:   }
2851:   if (ciscompressed) { /* need to expand host row offsets */
2852:     PetscInt r = 0;
2853:     c->i[0]    = 0;
2854:     for (k = 0; k < c->compressedrow.nrows; k++) {
2855:       const PetscInt next = c->compressedrow.rindex[k];
2856:       const PetscInt old  = c->compressedrow.i[k];
2857:       for (; r < next; r++) c->i[r + 1] = old;
2858:     }
2859:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2860:   }
2861:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2862:   PetscCall(PetscMalloc1(m, &c->ilen));
2863:   PetscCall(PetscMalloc1(m, &c->imax));
2864:   c->maxnz         = c->nz;
2865:   c->nonzerorowcnt = 0;
2866:   c->rmax          = 0;
2867:   for (k = 0; k < m; k++) {
2868:     const PetscInt nn = c->i[k + 1] - c->i[k];
2869:     c->ilen[k] = c->imax[k] = nn;
2870:     c->nonzerorowcnt += (PetscInt)!!nn;
2871:     c->rmax = PetscMax(c->rmax, nn);
2872:   }
2873:   PetscCall(PetscMalloc1(c->nz, &c->a));
2874:   Ccsr->num_entries = c->nz;

2876:   C->nonzerostate++;
2877:   PetscCall(PetscLayoutSetUp(C->rmap));
2878:   PetscCall(PetscLayoutSetUp(C->cmap));
2879:   Ccusp->nonzerostate = C->nonzerostate;
2880:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2881:   C->preallocated     = PETSC_TRUE;
2882:   C->assembled        = PETSC_FALSE;
2883:   C->was_assembled    = PETSC_FALSE;
2884:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2885:     mmdata->reusesym = PETSC_TRUE;
2886:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2887:   }
2888:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2889:   PetscFunctionReturn(PETSC_SUCCESS);
2890: }

2892: /* handles sparse or dense B */
2893: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2894: {
2895:   Mat_Product *product = mat->product;
2896:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2898:   PetscFunctionBegin;
2899:   MatCheckProduct(mat, 1);
2900:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2901:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2902:   if (product->type == MATPRODUCT_ABC) {
2903:     Ciscusp = PETSC_FALSE;
2904:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2905:   }
2906:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2907:     PetscBool usecpu = PETSC_FALSE;
2908:     switch (product->type) {
2909:     case MATPRODUCT_AB:
2910:       if (product->api_user) {
2911:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2912:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2913:         PetscOptionsEnd();
2914:       } else {
2915:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2916:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2917:         PetscOptionsEnd();
2918:       }
2919:       break;
2920:     case MATPRODUCT_AtB:
2921:       if (product->api_user) {
2922:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2923:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2924:         PetscOptionsEnd();
2925:       } else {
2926:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2927:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2928:         PetscOptionsEnd();
2929:       }
2930:       break;
2931:     case MATPRODUCT_PtAP:
2932:       if (product->api_user) {
2933:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2934:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2935:         PetscOptionsEnd();
2936:       } else {
2937:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2938:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2939:         PetscOptionsEnd();
2940:       }
2941:       break;
2942:     case MATPRODUCT_RARt:
2943:       if (product->api_user) {
2944:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2945:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2946:         PetscOptionsEnd();
2947:       } else {
2948:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2949:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2950:         PetscOptionsEnd();
2951:       }
2952:       break;
2953:     case MATPRODUCT_ABC:
2954:       if (product->api_user) {
2955:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2956:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2957:         PetscOptionsEnd();
2958:       } else {
2959:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2960:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2961:         PetscOptionsEnd();
2962:       }
2963:       break;
2964:     default:
2965:       break;
2966:     }
2967:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2968:   }
2969:   /* dispatch */
2970:   if (isdense) {
2971:     switch (product->type) {
2972:     case MATPRODUCT_AB:
2973:     case MATPRODUCT_AtB:
2974:     case MATPRODUCT_ABt:
2975:     case MATPRODUCT_PtAP:
2976:     case MATPRODUCT_RARt:
2977:       if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2978:       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2979:       break;
2980:     case MATPRODUCT_ABC:
2981:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2982:       break;
2983:     default:
2984:       break;
2985:     }
2986:   } else if (Biscusp && Ciscusp) {
2987:     switch (product->type) {
2988:     case MATPRODUCT_AB:
2989:     case MATPRODUCT_AtB:
2990:     case MATPRODUCT_ABt:
2991:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2992:       break;
2993:     case MATPRODUCT_PtAP:
2994:     case MATPRODUCT_RARt:
2995:     case MATPRODUCT_ABC:
2996:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2997:       break;
2998:     default:
2999:       break;
3000:     }
3001:   } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
3002:   PetscFunctionReturn(PETSC_SUCCESS);
3003: }

3005: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3006: {
3007:   PetscFunctionBegin;
3008:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3009:   PetscFunctionReturn(PETSC_SUCCESS);
3010: }

3012: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3013: {
3014:   PetscFunctionBegin;
3015:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3016:   PetscFunctionReturn(PETSC_SUCCESS);
3017: }

3019: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3020: {
3021:   PetscFunctionBegin;
3022:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3023:   PetscFunctionReturn(PETSC_SUCCESS);
3024: }

3026: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3027: {
3028:   PetscFunctionBegin;
3029:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3030:   PetscFunctionReturn(PETSC_SUCCESS);
3031: }

3033: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3034: {
3035:   PetscFunctionBegin;
3036:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3037:   PetscFunctionReturn(PETSC_SUCCESS);
3038: }

3040: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3041: {
3042:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3043:   if (i < n) y[idx[i]] += x[i];
3044: }

3046: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3047: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3048: {
3049:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
3050:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3051:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3052:   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
3053:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3054:   PetscBool                      compressed;
3055:   PetscInt                       nx, ny;

3057:   PetscFunctionBegin;
3058:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3059:   if (!a->nz) {
3060:     if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3061:     else PetscCall(VecSeq_HIP::Set(zz, 0));
3062:     PetscFunctionReturn(PETSC_SUCCESS);
3063:   }
3064:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3065:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3066:   if (!trans) {
3067:     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3068:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3069:   } else {
3070:     if (herm || !A->form_explicit_transpose) {
3071:       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3072:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3073:     } else {
3074:       if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3075:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3076:     }
3077:   }
3078:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3079:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3080:   try {
3081:     PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3082:     if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3083:     else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3085:     PetscCall(PetscLogGpuTimeBegin());
3086:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3087:       /* z = A x + beta y.
3088:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3089:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3090:       */
3091:       xptr = xarray;
3092:       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3093:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3094:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3095:           allocated to accommodate different uses. So we get the length info directly from mat.
3096:        */
3097:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3098:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3099:         nx             = mat->num_cols;
3100:         ny             = mat->num_rows;
3101:       }
3102:     } else {
3103:       /* z = A^T x + beta y
3104:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3105:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3106:        */
3107:       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3108:       dptr = zarray;
3109:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3110:       if (compressed) { /* Scatter x to work vector */
3111:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3112:         thrust::for_each(
3113: #if PetscDefined(HAVE_THRUST_ASYNC)
3114:           thrust::hip::par.on(PetscDefaultHipStream),
3115: #endif
3116:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3117:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3118:       }
3119:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3120:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3121:         nx             = mat->num_rows;
3122:         ny             = mat->num_cols;
3123:       }
3124:     }
3125:     /* csr_spmv does y = alpha op(A) x + beta y */
3126:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3127: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !(PETSC_PKG_HIP_VERSION_GT(6, 4, 3) && PETSC_PKG_HIP_VERSION_LE(7, 2, 0))
3128:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3129:       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3130:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3131:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3132:         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3133:                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
3134:         PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3135:         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3136:       } else {
3137:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3138:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3139:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3140:       }
3141:       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3142:                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3143: #else
3144:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3145:       nx             = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
3146:       ny             = mat->num_cols;
3147:       PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3148: #endif
3149:     } else {
3150:       if (hipsparsestruct->nrows) {
3151:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3152:         PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3153:       }
3154:     }
3155:     PetscCall(PetscLogGpuTimeEnd());

3157:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3158:       if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
3159:         if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3160:           PetscCall(VecSeq_HIP::Copy(yy, zz));      /* zz = yy */
3161:         } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3162:           PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3163:         }
3164:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3165:         PetscCall(VecSeq_HIP::Set(zz, 0));
3166:       }

3168:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3169:       if (compressed) {
3170:         PetscCall(PetscLogGpuTimeBegin());
3171:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3172:            and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3173:            prevent that. So I just add a ScatterAdd kernel.
3174:          */
3175: #if 0
3176:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3177:         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3178:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3179:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3180:                          VecHIPPlusEquals());
3181: #else
3182:         PetscInt n = matstruct->cprowIndices->size();
3183:         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3184: #endif
3185:         PetscCall(PetscLogGpuTimeEnd());
3186:       }
3187:     } else {
3188:       if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3189:     }
3190:     PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3191:     if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3192:     else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3193:   } catch (char *ex) {
3194:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3195:   }
3196:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3197:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3198:   PetscFunctionReturn(PETSC_SUCCESS);
3199: }

/* MatMultTransposeAdd implementation for this matrix type (installed as A->ops->multtransposeadd when the
   matrix is not bound to the CPU). All work is delegated to MatMultAddKernel_SeqAIJHIPSPARSE(). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* NOTE(review): the two trailing flags presumably mean (transpose = true, hermitian = false); confirm against the kernel's signature */
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly-end hook (installed as B->ops->assemblyend by MatConvert_SeqAIJ_SeqAIJHIPSPARSE()).
   Forwards the matrix and assembly mode unchanged to the shared CUPM implementation. */
static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AssemblyEnd(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3215: /*@
3216:   MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
  This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.

3219:   Collective

3221:   Input Parameters:
3222: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3223: . m    - number of rows
3224: . n    - number of columns
3225: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3226: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3228:   Output Parameter:
3229: . A - the matrix

3231:   Level: intermediate

3233:   Notes:
3234:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3236:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]

3238:   The AIJ format (compressed row storage), is fully compatible with standard Fortran
3239:   storage.  That is, the stored row and column indices can begin at
3240:   either one (as in Fortran) or zero.

3242:   Specify the preallocated storage with either `nz` or `nnz` (not both).
3243:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3244:   allocation.

3246: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3247: @*/
3248: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3249: {
3250:   return MatSeqAIJHIPSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
3251: }

3253: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3254: {
3255:   return MatSeqAIJHIPSPARSE_CUPM_t::Destroy(A);
3256: }

/* Duplicate hook (installed as B->ops->duplicate by MatConvert_SeqAIJ_SeqAIJHIPSPARSE()).
   Passes A, the copy option and the output matrix straight to the shared CUPM implementation. */
static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3265: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3266: {
3267:   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3268:   Mat_SeqAIJHIPSPARSE *cy;
3269:   Mat_SeqAIJHIPSPARSE *cx;
3270:   PetscScalar         *ay;
3271:   const PetscScalar   *ax;
3272:   CsrMatrix           *csry, *csrx;

3274:   PetscFunctionBegin;
3275:   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3276:   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3277:   if (X->ops->axpy != Y->ops->axpy) {
3278:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3279:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3280:     PetscFunctionReturn(PETSC_SUCCESS);
3281:   }
3282:   /* if we are here, it means both matrices are bound to GPU */
3283:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3284:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3285:   PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3286:   PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3287:   csry = (CsrMatrix *)cy->mat->mat;
3288:   csrx = (CsrMatrix *)cx->mat->mat;
3289:   /* see if we can turn this into a hipblas axpy */
3290:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3291:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3292:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3293:     if (eq) str = SAME_NONZERO_PATTERN;
3294:   }
3295:   /* spgeam is buggy with one column */
3296:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3297:   if (str == SUBSET_NONZERO_PATTERN) {
3298:     PetscScalar b = 1.0;
3299: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3300:     size_t bufferSize;
3301:     void  *buffer;
3302: #endif

3304:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3305:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3306:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3307: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3308:     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3309:                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3310:     PetscCallHIP(hipMalloc(&buffer, bufferSize));
3311:     PetscCall(PetscLogGpuTimeBegin());
3312:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3313:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3314:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3315:     PetscCall(PetscLogGpuTimeEnd());
3316:     PetscCallHIP(hipFree(buffer));
3317: #else
3318:     PetscCall(PetscLogGpuTimeBegin());
3319:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3320:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3321:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3322:     PetscCall(PetscLogGpuTimeEnd());
3323: #endif
3324:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3325:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3326:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3327:   } else if (str == SAME_NONZERO_PATTERN) {
3328:     PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
3329:   } else {
3330:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3331:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3332:   }
3333:   PetscFunctionReturn(PETSC_SUCCESS);
3334: }

/* Scale hook (installed as A->ops->scale by MatBindToCPU_SeqAIJHIPSPARSE() when the matrix is not
   bound to the CPU). Forwards Y and the scalar a to the shared CUPM implementation. */
static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Scale(Y, a));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Diagonal-scale hook (installed as A->ops->diagonalscale when the matrix is not bound to the CPU).
   Forwards A and the two scaling vectors ll and rr to the shared CUPM implementation. */
static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat A, Vec ll, Vec rr)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero-entries hook (installed as A->ops->zeroentries when the matrix is not bound to the CPU);
   delegates to the shared CUPM implementation. */
static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::ZeroEntries(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Memory-type query hook (installed as ops->getcurrentmemtype); the result is written to m by the
   shared CUPM implementation. */
static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetCurrentMemType(A, m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3364: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3365: {
3366:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3368:   PetscFunctionBegin;
3369:   if (A->factortype != MAT_FACTOR_NONE) {
3370:     A->boundtocpu = flg;
3371:     PetscFunctionReturn(PETSC_SUCCESS);
3372:   }
3373:   if (flg) {
3374:     PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

3376:     A->ops->scale                     = MatScale_SeqAIJ;
3377:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
3378:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3379:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3380:     A->ops->mult                      = MatMult_SeqAIJ;
3381:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3382:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3383:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3384:     A->ops->multhermitiantranspose    = NULL;
3385:     A->ops->multhermitiantransposeadd = NULL;
3386:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3387:     A->ops->getcurrentmemtype         = NULL;
3388:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3389:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3390:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3391:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3392:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3393:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3394:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3395:   } else {
3396:     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
3397:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJHIPSPARSE;
3398:     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
3399:     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
3400:     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
3401:     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
3402:     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
3403:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3404:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3405:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3406:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3407:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
3408:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3409:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3410:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3411:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3412:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3413:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3414:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3415:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3416:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3417:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3418:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3419:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3420:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3421:   }
3422:   A->boundtocpu = flg;
3423:   if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
3424:   else a->inode.use = PETSC_FALSE;
3425:   PetscFunctionReturn(PETSC_SUCCESS);
3426: }

3428: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3429: {
3430:   Mat B;

3432:   PetscFunctionBegin;
3433:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3434:   if (reuse == MAT_INITIAL_MATRIX) {
3435:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3436:   } else if (reuse == MAT_REUSE_MATRIX) {
3437:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3438:   }
3439:   B = *newmat;
3440:   PetscCall(PetscFree(B->defaultvectype));
3441:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3442:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3443:     if (B->factortype == MAT_FACTOR_NONE) {
3444:       Mat_SeqAIJHIPSPARSE *spptr;
3445:       PetscCall(PetscNew(&spptr));
3446:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3447:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3448:       spptr->format = MAT_HIPSPARSE_CSR;
3449: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3450:       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3451: #else
3452:       spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3453: #endif
3454:       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3455:       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

3457:       B->spptr = spptr;
3458:     } else {
3459:       Mat_SeqAIJHIPSPARSETriFactors *spptr;

3461:       PetscCall(PetscNew(&spptr));
3462:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3463:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3464:       B->spptr = spptr;
3465:     }
3466:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3467:   }
3468:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
3469:   B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
3470:   B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
3471:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
3472:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
3473:   B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
3474:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;

3476:   PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3477:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3478:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3479: #if defined(PETSC_HAVE_HYPRE)
3480:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3481: #endif
3482:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3483:   PetscFunctionReturn(PETSC_SUCCESS);
3484: }

/* Type constructor for MATSEQAIJHIPSPARSE: builds B as a plain SeqAIJ matrix and then converts it
   in place (MAT_INPLACE_MATRIX, with B as both source and destination). */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3494: /*MC
3495:    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs

3497:    A matrix type whose data resides on AMD GPUs. These matrices can be in either
3498:    CSR, ELL, or Hybrid format.
3499:    All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.

3501:    Options Database Keys:
3502: +  -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3503: .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3504:                                        Other options include ell (ellpack) or hyb (hybrid).
3505: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3506: -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU

3508:   Level: beginner

3510: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3511: M*/

3513: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3514: {
3515:   PetscFunctionBegin;
3516:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3517:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3518:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3519:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3520:   PetscFunctionReturn(PETSC_SUCCESS);
3521: }

3523: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
3524: {
3525:   Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

3527:   PetscFunctionBegin;
3528:   if (cusp) {
3529:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
3530:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3531:     delete cusp->workVector;
3532:     delete cusp->rowoffsets_gpu;
3533:     delete cusp->csr2csc_i;
3534:     delete cusp->coords;
3535:     if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
3536:     PetscCall(PetscFree(mat->spptr));
3537:   }
3538:   PetscFunctionReturn(PETSC_SUCCESS);
3539: }

3541: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3542: {
3543:   PetscFunctionBegin;
3544:   if (*mat) {
3545:     delete (*mat)->values;
3546:     delete (*mat)->column_indices;
3547:     delete (*mat)->row_offsets;
3548:     delete *mat;
3549:     *mat = 0;
3550:   }
3551:   PetscFunctionReturn(PETSC_SUCCESS);
3552: }

3554: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3555: {
3556:   PetscFunctionBegin;
3557:   if (*trifactor) {
3558:     if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3559:     if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3560:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3561:     PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3562:     PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3563:     PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3564:     PetscCall(PetscFree(*trifactor));
3565:   }
3566:   PetscFunctionReturn(PETSC_SUCCESS);
3567: }

3569: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3570: {
3571:   CsrMatrix *mat;

3573:   PetscFunctionBegin;
3574:   if (*matstruct) {
3575:     if ((*matstruct)->mat) {
3576:       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3577:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3578:         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3579:       } else {
3580:         mat = (CsrMatrix *)(*matstruct)->mat;
3581:         PetscCall(CsrMatrix_Destroy(&mat));
3582:       }
3583:     }
3584:     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3585:     delete (*matstruct)->cprowIndices;
3586:     PetscCallHIP(hipFree((*matstruct)->alpha_one));
3587:     PetscCallHIP(hipFree((*matstruct)->beta_zero));
3588:     PetscCallHIP(hipFree((*matstruct)->beta_one));

3590:     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3591:     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3592:     for (int i = 0; i < 3; i++) {
3593:       if (mdata->hipSpMV[i].initialized) {
3594:         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3595:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3596:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3597:       }
3598:     }
3599:     delete *matstruct;
3600:     *matstruct = NULL;
3601:   }
3602:   PetscFunctionReturn(PETSC_SUCCESS);
3603: }

3605: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3606: {
3607:   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

3609:   PetscFunctionBegin;
3610:   if (fs) {
3611:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3612:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3613:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3614:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3615:     delete fs->rpermIndices;
3616:     delete fs->cpermIndices;
3617:     delete fs->workVector;
3618:     fs->rpermIndices  = NULL;
3619:     fs->cpermIndices  = NULL;
3620:     fs->workVector    = NULL;
3621:     fs->init_dev_prop = PETSC_FALSE;
3622: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3623:     PetscCallHIP(hipFree(fs->csrRowPtr));
3624:     PetscCallHIP(hipFree(fs->csrColIdx));
3625:     PetscCallHIP(hipFree(fs->csrVal));
3626:     PetscCallHIP(hipFree(fs->X));
3627:     PetscCallHIP(hipFree(fs->Y));
3628:     // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3629:     PetscCallHIP(hipFree(fs->spsvBuffer_L));
3630:     PetscCallHIP(hipFree(fs->spsvBuffer_U));
3631:     PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3632:     PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3633:     PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3634:     if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3635:     if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3636:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3637:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3638:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3639:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3640:     if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3641:     if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3642:     PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3643:     PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

3645:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3646:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3647: #endif
3648:   }
3649:   PetscFunctionReturn(PETSC_SUCCESS);
3650: }

3652: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3653: {
3654:   hipsparseHandle_t handle;

3656:   PetscFunctionBegin;
3657:   if (*trifactors) {
3658:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3659:     if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3660:     PetscCall(PetscFree(*trifactors));
3661:   }
3662:   PetscFunctionReturn(PETSC_SUCCESS);
3663: }

3665: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3666: {
3667:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3669:   PetscFunctionBegin;
3670:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3671:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3672:   if (destroy) {
3673:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3674:     delete cusp->csr2csc_i;
3675:     cusp->csr2csc_i = NULL;
3676:   }
3677:   A->transupdated = PETSC_FALSE;
3678:   PetscFunctionReturn(PETSC_SUCCESS);
3679: }

/* COO preallocation entry point (composed on the matrix as "MatSetPreallocationCOO_C" while it is
   not bound to the CPU). Forwards the entry count and the i/j index arrays to the shared CUPM implementation. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* COO value-setting entry point (composed on the matrix as "MatSetValuesCOO_C" while it is not
   bound to the CPU). Forwards the value array and insert mode to the shared CUPM implementation. */
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3695: /*@C
3696:   MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.

3698:   Not Collective

3700:   Input Parameters:
3701: + A          - the matrix
3702: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

3704:   Output Parameters:
3705: + i - the CSR row pointers
3706: - j - the CSR column indices

3708:   Level: developer

3710:   Note:
3711:   When compressed is true, the CSR structure does not contain empty rows

3713: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3714: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  PetscFunctionBegin;
  /* all arguments are handed unchanged to the shared CUPM implementation, which fills i and j */
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3722: /*@C
3723:   MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`

3725:   Not Collective

3727:   Input Parameters:
3728: + A          - the matrix
3729: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3730: . i          - the CSR row pointers
3731: - j          - the CSR column indices

3733:   Level: developer

3735: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3736: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  PetscFunctionBegin;
  /* counterpart of MatSeqAIJHIPSPARSEGetIJ(); all arguments are handed unchanged to the shared CUPM implementation */
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
  PetscFunctionReturn(PETSC_SUCCESS);
}

3744: /*@C
3745:   MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3747:   Not Collective

3749:   Input Parameter:
3750: . A - a `MATSEQAIJHIPSPARSE` matrix

3752:   Output Parameter:
3753: . a - pointer to the device data

3755:   Level: developer

3757:   Note:
3758:   May trigger host-device copies if the up-to-date matrix data is on host

3760: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3761: @*/
3762: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3763: {
3764:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayRead(A, a);
3765: }

3767: /*@C
3768:   MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`

3770:   Not Collective

3772:   Input Parameters:
3773: + A - a `MATSEQAIJHIPSPARSE` matrix
3774: - a - pointer to the device data

3776:   Level: developer

3778: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3779: @*/
3780: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3781: {
3782:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayRead(A, a);
3783: }

3785: /*@C
3786:   MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3788:   Not Collective

3790:   Input Parameter:
3791: . A - a `MATSEQAIJHIPSPARSE` matrix

3793:   Output Parameter:
3794: . a - pointer to the device data

3796:   Level: developer

3798:   Note:
3799:   May trigger host-device copies if up-to-date matrix data is on host

3801: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3802: @*/
3803: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3804: {
3805:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArray(A, a);
3806: }
3807: /*@C
3808:   MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`

3810:   Not Collective

3812:   Input Parameters:
3813: + A - a `MATSEQAIJHIPSPARSE` matrix
3814: - a - pointer to the device data

3816:   Level: developer

3818: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
3819: @*/
3820: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
3821: {
3822:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArray(A, a);
3823: }

3825: /*@C
3826:   MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3828:   Not Collective

3830:   Input Parameter:
3831: . A - a `MATSEQAIJHIPSPARSE` matrix

3833:   Output Parameter:
3834: . a - pointer to the device data

3836:   Level: developer

3838:   Note:
3839:   Does not trigger host-device copies and flags data validity on the GPU

3841: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
3842: @*/
3843: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
3844: {
3845:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayWrite(A, a);
3846: }

3848: /*@C
3849:   MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`

3851:   Not Collective

3853:   Input Parameters:
3854: + A - a `MATSEQAIJHIPSPARSE` matrix
3855: - a - pointer to the device data

3857:   Level: developer

3859: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
3860: @*/
3861: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
3862: {
3863:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3864: }

3866: struct IJCompare4 {
3867:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3868:   {
3869:     if (t1.get<0>() < t2.get<0>()) return true;
3870:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3871:     return false;
3872:   }
3873: };

/*
  Shift - unary functor that adds a fixed integer offset to its argument.

  The offset is stored at construction time; operator() returns c + offset and does not modify the
  functor, so it is declared const and can be invoked through const objects.
*/
struct Shift {
  int _shift; /* offset added to every argument */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};

3882: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3883: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3884: {
3885:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3886:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
3887:   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
3888:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
3889:   PetscInt                       Annz, Bnnz;
3890:   PetscInt                       i, m, n, zero = 0;

3892:   PetscFunctionBegin;
3895:   PetscAssertPointer(C, 4);
3896:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3897:   PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
3898:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3899:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3900:   PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3901:   PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3902:   if (reuse == MAT_INITIAL_MATRIX) {
3903:     m = A->rmap->n;
3904:     n = A->cmap->n + B->cmap->n;
3905:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
3906:     PetscCall(MatSetSizes(*C, m, n, m, n));
3907:     PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
3908:     c                       = (Mat_SeqAIJ *)(*C)->data;
3909:     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
3910:     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
3911:     Ccsr                    = new CsrMatrix;
3912:     Cmat->cprowIndices      = NULL;
3913:     c->compressedrow.use    = PETSC_FALSE;
3914:     c->compressedrow.nrows  = 0;
3915:     c->compressedrow.i      = NULL;
3916:     c->compressedrow.rindex = NULL;
3917:     Ccusp->workVector       = NULL;
3918:     Ccusp->nrows            = m;
3919:     Ccusp->mat              = Cmat;
3920:     Ccusp->mat->mat         = Ccsr;
3921:     Ccsr->num_rows          = m;
3922:     Ccsr->num_cols          = n;
3923:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
3924:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
3925:     PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
3926:     PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3927:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3928:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3929:     PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3930:     PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
3931:     PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3932:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3933:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
3934:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3935:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

3937:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
3938:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
3939:     Annz                 = (PetscInt)Acsr->column_indices->size();
3940:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
3941:     c->nz                = Annz + Bnnz;
3942:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
3943:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3944:     Ccsr->values         = new THRUSTARRAY(c->nz);
3945:     Ccsr->num_entries    = c->nz;
3946:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
3947:     if (c->nz) {
3948:       auto              Acoo = new THRUSTINTARRAY32(Annz);
3949:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
3950:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
3951:       THRUSTINTARRAY32 *Aroff, *Broff;

3953:       if (a->compressedrow.use) { /* need full row offset */
3954:         if (!Acusp->rowoffsets_gpu) {
3955:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3956:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3957:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3958:         }
3959:         Aroff = Acusp->rowoffsets_gpu;
3960:       } else Aroff = Acsr->row_offsets;
3961:       if (b->compressedrow.use) { /* need full row offset */
3962:         if (!Bcusp->rowoffsets_gpu) {
3963:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3964:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3965:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3966:         }
3967:         Broff = Bcusp->rowoffsets_gpu;
3968:       } else Broff = Bcsr->row_offsets;
3969:       PetscCall(PetscLogGpuTimeBegin());
3970:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
3971:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
3972:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3973:       auto Aperm = thrust::make_constant_iterator(1);
3974:       auto Bperm = thrust::make_constant_iterator(0);
3975:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3976:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3977:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
3978:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3979:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3980:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
3981:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3982:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3983:       auto p1    = Ccusp->coords->begin();
3984:       auto p2    = Ccusp->coords->begin();
3985:       thrust::advance(p2, Annz);
3986:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
3987:       auto cci = thrust::make_counting_iterator(zero);
3988:       auto cce = thrust::make_counting_iterator(c->nz);
3989: #if 0 //Errors on SUMMIT cuda 11.1.0
3990:       PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
3991: #else
3992:       auto pred = [](const int &x) { return x; };
3993:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3994:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3995: #endif
3996:       PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
3997:       PetscCall(PetscLogGpuTimeEnd());
3998:       delete wPerm;
3999:       delete Acoo;
4000:       delete Bcoo;
4001:       delete Ccoo;
4002:       PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));

4004:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4005:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4006:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4007:         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4008:         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4009:         CsrMatrix                     *CcsrT = new CsrMatrix;
4010:         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4011:         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4013:         (*C)->form_explicit_transpose = PETSC_TRUE;
4014:         (*C)->transupdated            = PETSC_TRUE;
4015:         Ccusp->rowoffsets_gpu         = NULL;
4016:         CmatT->cprowIndices           = NULL;
4017:         CmatT->mat                    = CcsrT;
4018:         CcsrT->num_rows               = n;
4019:         CcsrT->num_cols               = m;
4020:         CcsrT->num_entries            = c->nz;
4021:         CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
4022:         CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
4023:         CcsrT->values                 = new THRUSTARRAY(c->nz);

4025:         PetscCall(PetscLogGpuTimeBegin());
4026:         auto rT = CcsrT->row_offsets->begin();
4027:         if (AT) {
4028:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4029:           thrust::advance(rT, -1);
4030:         }
4031:         if (BT) {
4032:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4033:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4034:           thrust::copy(titb, tite, rT);
4035:         }
4036:         auto cT = CcsrT->column_indices->begin();
4037:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4038:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4039:         auto vT = CcsrT->values->begin();
4040:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4041:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4042:         PetscCall(PetscLogGpuTimeEnd());

4044:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4045:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4046:         PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4047:         PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4048:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4049:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4050:         PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4051:         PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4052:         PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

4054:         PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4055:         Ccusp->matTranspose = CmatT;
4056:       }
4057:     }

4059:     c->free_a = PETSC_TRUE;
4060:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4061:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4062:     c->free_ij = PETSC_TRUE;
4063:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4064:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4065:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4066:       ii = *Ccsr->row_offsets;
4067:       jj = *Ccsr->column_indices;
4068:       PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4069:       PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4070:     } else {
4071:       PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4072:       PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4073:     }
4074:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4075:     PetscCall(PetscMalloc1(m, &c->ilen));
4076:     PetscCall(PetscMalloc1(m, &c->imax));
4077:     c->maxnz         = c->nz;
4078:     c->nonzerorowcnt = 0;
4079:     c->rmax          = 0;
4080:     for (i = 0; i < m; i++) {
4081:       const PetscInt nn = c->i[i + 1] - c->i[i];
4082:       c->ilen[i] = c->imax[i] = nn;
4083:       c->nonzerorowcnt += (PetscInt)!!nn;
4084:       c->rmax = PetscMax(c->rmax, nn);
4085:     }
4086:     PetscCall(PetscMalloc1(c->nz, &c->a));
4087:     (*C)->nonzerostate++;
4088:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4089:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4090:     Ccusp->nonzerostate = (*C)->nonzerostate;
4091:     (*C)->preallocated  = PETSC_TRUE;
4092:   } else {
4093:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4094:     c = (Mat_SeqAIJ *)(*C)->data;
4095:     if (c->nz) {
4096:       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4097:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4098:       PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4099:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4100:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4101:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4102:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4103:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4104:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4105:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4106:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4107:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4108:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4109:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4110:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4111:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4112:       auto pmid = Ccusp->coords->begin();
4113:       thrust::advance(pmid, Acsr->num_entries);
4114:       PetscCall(PetscLogGpuTimeBegin());
4115:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4116:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4117:       thrust::for_each(zibait, zieait, VecHIPEquals());
4118:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4119:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4120:       thrust::for_each(zibbit, ziebit, VecHIPEquals());
4121:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4122:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4123:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4124:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4125:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4126:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4127:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4128:         auto       vT    = CcsrT->values->begin();
4129:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4130:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4131:         (*C)->transupdated = PETSC_TRUE;
4132:       }
4133:       PetscCall(PetscLogGpuTimeEnd());
4134:     }
4135:   }
4136:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4137:   (*C)->assembled     = PETSC_TRUE;
4138:   (*C)->was_assembled = PETSC_FALSE;
4139:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4140:   PetscFunctionReturn(PETSC_SUCCESS);
4141: }

/*
  MatSeqAIJCopySubArray_SeqAIJHIPSPARSE - file-local sub-array copy entry point for the HIPSPARSE AIJ matrix type.

  Passes A, n, idx and v unchanged to MatSeqAIJHIPSPARSE_CUPM_t::CopySubArray(); any error code
  returned by that call is propagated to the caller through PetscCall().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscFunctionBegin;
  /* all of the work is done by the shared CUPM implementation */
  PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
  PetscFunctionReturn(PETSC_SUCCESS);
}