Actual source code: aijhipsparse.hip.cpp

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the HIPSPARSE library,
  4:   Portions of this code are under:
  5:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  6: */
  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/mat/impls/dense/seq/dense.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 15: #include <thrust/adjacent_difference.h>
 16: #include <thrust/iterator/transform_iterator.h>
 17: #if PETSC_CPP_VERSION >= 14
 18:   #define PETSC_HAVE_THRUST_ASYNC 1
 19:   #include <thrust/async/for_each.h>
 20: #endif
 21: #include <thrust/iterator/constant_iterator.h>
 22: #include <thrust/iterator/discard_iterator.h>
 23: #include <thrust/binary_search.h>
 24: #include <thrust/remove.h>
 25: #include <thrust/sort.h>
 26: #include <thrust/unique.h>

 28: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
 29: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
 30: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
 31: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

 33: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 34: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 35: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 36: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 37: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 38: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 39: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 41: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
 43: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
 44: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
 45: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
 46: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 47: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 48: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 49: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 50: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 51: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 52: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
 53: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 54: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
 55: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
 56: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
 57: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
 58: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
 59: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
 60: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
 61: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 62: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
 63: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 64: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

 66: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
 67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);

 69: /*
 70: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
 71: {
 72:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 74:   PetscFunctionBegin;
 75:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 76:   hipsparsestruct->stream = stream;
 77:   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
 78:   PetscFunctionReturn(PETSC_SUCCESS);
 79: }

 81: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
 82: {
 83:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 85:   PetscFunctionBegin;
 86:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 87:   if (hipsparsestruct->handle != handle) {
 88:     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
 89:     hipsparsestruct->handle = handle;
 90:   }
 91:   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
 92:   PetscFunctionReturn(PETSC_SUCCESS);
 93: }

 95: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
 96: {
 97:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
 98:   PetscBool            flg;

100:   PetscFunctionBegin;
101:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102:   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103:   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104:   PetscFunctionReturn(PETSC_SUCCESS);
105: }
106: */

108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

112:   PetscFunctionBegin;
113:   switch (op) {
114:   case MAT_HIPSPARSE_MULT:
115:     hipsparsestruct->format = format;
116:     break;
117:   case MAT_HIPSPARSE_ALL:
118:     hipsparsestruct->format = format;
119:     break;
120:   default:
121:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122:   }
123:   PetscFunctionReturn(PETSC_SUCCESS);
124: }

126: /*@
127:   MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
128:   operation. Only the `MatMult()` operation can use different GPU storage formats

130:   Not Collective

132:   Input Parameters:
133: + A      - Matrix of type `MATSEQAIJHIPSPARSE`
134: . op     - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135:          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)

138:   Level: intermediate

140: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
142: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143: {
144:   PetscFunctionBegin;
146:   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
147:   PetscFunctionReturn(PETSC_SUCCESS);
148: }

150: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151: {
152:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

154:   PetscFunctionBegin;
155:   hipsparsestruct->use_cpu_solve = use_cpu;
156:   PetscFunctionReturn(PETSC_SUCCESS);
157: }

159: /*@
160:   MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.

162:   Input Parameters:
163: + A       - Matrix of type `MATSEQAIJHIPSPARSE`
164: - use_cpu - set flag for using the built-in CPU `MatSolve()`

166:   Level: intermediate

168:   Notes:
169:   The hipSparse LU solver currently computes the factors with the built-in CPU method
170:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
171:   This method to specifies if the solve is done on the CPU or GPU (GPU is the default).

173: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
175: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176: {
177:   PetscFunctionBegin;
179:   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
180:   PetscFunctionReturn(PETSC_SUCCESS);
181: }

183: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184: {
185:   PetscFunctionBegin;
186:   switch (op) {
187:   case MAT_FORM_EXPLICIT_TRANSPOSE:
188:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190:     A->form_explicit_transpose = flg;
191:     break;
192:   default:
193:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194:     break;
195:   }
196:   PetscFunctionReturn(PETSC_SUCCESS);
197: }

199: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200: {
201:   PetscBool            row_identity, col_identity;
202:   Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
203:   IS                   isrow = b->row, iscol = b->col;
204:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

206:   PetscFunctionBegin;
207:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209:   B->offloadmask = PETSC_OFFLOAD_CPU;
210:   /* determine which version of MatSolve needs to be used. */
211:   PetscCall(ISIdentity(isrow, &row_identity));
212:   PetscCall(ISIdentity(iscol, &col_identity));
213:   if (!hipsparsestruct->use_cpu_solve) {
214:     if (row_identity && col_identity) {
215:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217:     } else {
218:       B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
219:       B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220:     }
221:   }
222:   B->ops->matsolve          = NULL;
223:   B->ops->matsolvetranspose = NULL;

225:   /* get the triangular factors */
226:   if (!hipsparsestruct->use_cpu_solve) { PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); }
227:   PetscFunctionReturn(PETSC_SUCCESS);
228: }

230: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
231: {
232:   MatHIPSPARSEStorageFormat format;
233:   PetscBool                 flg;
234:   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

236:   PetscFunctionBegin;
237:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
238:   if (A->factortype == MAT_FACTOR_NONE) {
239:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
240:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
241:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
242:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
243:     PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
244:     if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
245:     PetscCall(
246:       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
247:     /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
248:     PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
249:     PetscCall(
250:       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
251:     PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
252:     /*
253:     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
254:     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
255:     */
256:   }
257:   PetscOptionsHeadEnd();
258:   PetscFunctionReturn(PETSC_SUCCESS);
259: }

261: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
264:   PetscInt                            n                   = A->rmap->n;
265:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
266:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
267:   const PetscInt                     *ai = a->i, *aj = a->j, *vi;
268:   const MatScalar                    *aa = a->a, *v;
269:   PetscInt                           *AiLo, *AjLo;
270:   PetscInt                            i, nz, nzLower, offset, rowOffset;

272:   PetscFunctionBegin;
273:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
274:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
275:     try {
276:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
277:       nzLower = n + ai[n] - ai[1];
278:       if (!loTriFactor) {
279:         PetscScalar *AALo;
280:         PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

282:         /* Allocate Space for the lower triangular matrix */
283:         PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
284:         PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

286:         /* Fill the lower triangular matrix */
287:         AiLo[0]   = (PetscInt)0;
288:         AiLo[n]   = nzLower;
289:         AjLo[0]   = (PetscInt)0;
290:         AALo[0]   = (MatScalar)1.0;
291:         v         = aa;
292:         vi        = aj;
293:         offset    = 1;
294:         rowOffset = 1;
295:         for (i = 1; i < n; i++) {
296:           nz = ai[i + 1] - ai[i];
297:           /* additional 1 for the term on the diagonal */
298:           AiLo[i] = rowOffset;
299:           rowOffset += nz + 1;

301:           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
302:           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
303:           offset += nz;
304:           AjLo[offset] = (PetscInt)i;
305:           AALo[offset] = (MatScalar)1.0;
306:           offset += 1;
307:           v += nz;
308:           vi += nz;
309:         }

311:         /* allocate space for the triangular factor information */
312:         PetscCall(PetscNew(&loTriFactor));
313:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
314:         /* Create the matrix description */
315:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
316:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
317:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
318:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
319:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

321:         /* set the operation */
322:         loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

324:         /* set the matrix */
325:         loTriFactor->csrMat                 = new CsrMatrix;
326:         loTriFactor->csrMat->num_rows       = n;
327:         loTriFactor->csrMat->num_cols       = n;
328:         loTriFactor->csrMat->num_entries    = nzLower;
329:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
330:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
331:         loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

333:         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
334:         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
335:         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

337:         /* Create the solve analysis information */
338:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
339:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
340:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
342:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

344:         /* perform the solve analysis */
345:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

348:         PetscCallHIP(WaitForHIP());
349:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

351:         /* assign the pointer */
352:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
353:         loTriFactor->AA_h                                           = AALo;
354:         PetscCallHIP(hipHostFree(AiLo));
355:         PetscCallHIP(hipHostFree(AjLo));
356:         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
357:       } else { /* update values only */
358:         if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
359:         /* Fill the lower triangular matrix */
360:         loTriFactor->AA_h[0] = 1.0;
361:         v                    = aa;
362:         vi                   = aj;
363:         offset               = 1;
364:         for (i = 1; i < n; i++) {
365:           nz = ai[i + 1] - ai[i];
366:           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
367:           offset += nz;
368:           loTriFactor->AA_h[offset] = 1.0;
369:           offset += 1;
370:           v += nz;
371:         }
372:         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
373:         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
374:       }
375:     } catch (char *ex) {
376:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
377:     }
378:   }
379:   PetscFunctionReturn(PETSC_SUCCESS);
380: }

382: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
383: {
384:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
385:   PetscInt                            n                   = A->rmap->n;
386:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
387:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
388:   const PetscInt                     *aj = a->j, *adiag = a->diag, *vi;
389:   const MatScalar                    *aa = a->a, *v;
390:   PetscInt                           *AiUp, *AjUp;
391:   PetscInt                            i, nz, nzUpper, offset;

393:   PetscFunctionBegin;
394:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
395:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
396:     try {
397:       /* next, figure out the number of nonzeros in the upper triangular matrix. */
398:       nzUpper = adiag[0] - adiag[n];
399:       if (!upTriFactor) {
400:         PetscScalar *AAUp;
401:         PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

403:         /* Allocate Space for the upper triangular matrix */
404:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
405:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

407:         /* Fill the upper triangular matrix */
408:         AiUp[0] = (PetscInt)0;
409:         AiUp[n] = nzUpper;
410:         offset  = nzUpper;
411:         for (i = n - 1; i >= 0; i--) {
412:           v  = aa + adiag[i + 1] + 1;
413:           vi = aj + adiag[i + 1] + 1;
414:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
415:           offset -= (nz + 1);               /* decrement the offset */

417:           /* first, set the diagonal elements */
418:           AjUp[offset] = (PetscInt)i;
419:           AAUp[offset] = (MatScalar)1. / v[nz];
420:           AiUp[i]      = AiUp[i + 1] - (nz + 1);

422:           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
423:           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
424:         }

426:         /* allocate space for the triangular factor information */
427:         PetscCall(PetscNew(&upTriFactor));
428:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

430:         /* Create the matrix description */
431:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
432:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
433:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
434:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
435:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

437:         /* set the operation */
438:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

440:         /* set the matrix */
441:         upTriFactor->csrMat                 = new CsrMatrix;
442:         upTriFactor->csrMat->num_rows       = n;
443:         upTriFactor->csrMat->num_cols       = n;
444:         upTriFactor->csrMat->num_entries    = nzUpper;
445:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
446:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
447:         upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
448:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
449:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
450:         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

452:         /* Create the solve analysis information */
453:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
454:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
455:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
456:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
457:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

459:         /* perform the solve analysis */
460:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
461:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

463:         PetscCallHIP(WaitForHIP());
464:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

466:         /* assign the pointer */
467:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
468:         upTriFactor->AA_h                                           = AAUp;
469:         PetscCallHIP(hipHostFree(AiUp));
470:         PetscCallHIP(hipHostFree(AjUp));
471:         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
472:       } else {
473:         if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
474:         /* Fill the upper triangular matrix */
475:         offset = nzUpper;
476:         for (i = n - 1; i >= 0; i--) {
477:           v  = aa + adiag[i + 1] + 1;
478:           nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
479:           offset -= (nz + 1);               /* decrement the offset */

481:           /* first, set the diagonal elements */
482:           upTriFactor->AA_h[offset] = 1. / v[nz];
483:           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
484:         }
485:         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
486:         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
487:       }
488:     } catch (char *ex) {
489:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
490:     }
491:   }
492:   PetscFunctionReturn(PETSC_SUCCESS);
493: }

495: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
496: {
497:   PetscBool                      row_identity, col_identity;
498:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
499:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
500:   IS                             isrow = a->row, iscol = a->icol;
501:   PetscInt                       n = A->rmap->n;

503:   PetscFunctionBegin;
504:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
505:   PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
506:   PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));

508:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
509:   hipsparseTriFactors->nnz = a->nz;

511:   A->offloadmask = PETSC_OFFLOAD_BOTH;
512:   /* lower triangular indices */
513:   PetscCall(ISIdentity(isrow, &row_identity));
514:   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
515:     const PetscInt *r;

517:     PetscCall(ISGetIndices(isrow, &r));
518:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
519:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
520:     PetscCall(ISRestoreIndices(isrow, &r));
521:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
522:   }
523:   /* upper triangular indices */
524:   PetscCall(ISIdentity(iscol, &col_identity));
525:   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
526:     const PetscInt *c;

528:     PetscCall(ISGetIndices(iscol, &c));
529:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
530:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
531:     PetscCall(ISRestoreIndices(iscol, &c));
532:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
533:   }
534:   PetscFunctionReturn(PETSC_SUCCESS);
535: }

537: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
538: {
539:   Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
540:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
541:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
542:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
543:   PetscInt                           *AiUp, *AjUp;
544:   PetscScalar                        *AAUp;
545:   PetscScalar                        *AALo;
546:   PetscInt                            nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
547:   Mat_SeqSBAIJ                       *b  = (Mat_SeqSBAIJ *)A->data;
548:   const PetscInt                     *ai = b->i, *aj = b->j, *vj;
549:   const MatScalar                    *aa = b->a, *v;

551:   PetscFunctionBegin;
552:   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
553:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
554:     try {
555:       PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
556:       PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
557:       if (!upTriFactor && !loTriFactor) {
558:         /* Allocate Space for the upper triangular matrix */
559:         PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
560:         PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

562:         /* Fill the upper triangular matrix */
563:         AiUp[0] = (PetscInt)0;
564:         AiUp[n] = nzUpper;
565:         offset  = 0;
566:         for (i = 0; i < n; i++) {
567:           /* set the pointers */
568:           v  = aa + ai[i];
569:           vj = aj + ai[i];
570:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

572:           /* first, set the diagonal elements */
573:           AjUp[offset] = (PetscInt)i;
574:           AAUp[offset] = (MatScalar)1.0 / v[nz];
575:           AiUp[i]      = offset;
576:           AALo[offset] = (MatScalar)1.0 / v[nz];

578:           offset += 1;
579:           if (nz > 0) {
580:             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
581:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
582:             for (j = offset; j < offset + nz; j++) {
583:               AAUp[j] = -AAUp[j];
584:               AALo[j] = AAUp[j] / v[nz];
585:             }
586:             offset += nz;
587:           }
588:         }

590:         /* allocate space for the triangular factor information */
591:         PetscCall(PetscNew(&upTriFactor));
592:         upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

594:         /* Create the matrix description */
595:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
596:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
597:         PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
598:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
599:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

601:         /* set the matrix */
602:         upTriFactor->csrMat                 = new CsrMatrix;
603:         upTriFactor->csrMat->num_rows       = A->rmap->n;
604:         upTriFactor->csrMat->num_cols       = A->cmap->n;
605:         upTriFactor->csrMat->num_entries    = a->nz;
606:         upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
607:         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
608:         upTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
609:         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
610:         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
611:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

613:         /* set the operation */
614:         upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

616:         /* Create the solve analysis information */
617:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
618:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
619:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
620:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
621:         PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

623:         /* perform the solve analysis */
624:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
625:                                                     upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

627:         PetscCallHIP(WaitForHIP());
628:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

630:         /* assign the pointer */
631:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

633:         /* allocate space for the triangular factor information */
634:         PetscCall(PetscNew(&loTriFactor));
635:         loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

637:         /* Create the matrix description */
638:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
639:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
640:         PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
641:         PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
642:         PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

644:         /* set the operation */
645:         loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;

647:         /* set the matrix */
648:         loTriFactor->csrMat                 = new CsrMatrix;
649:         loTriFactor->csrMat->num_rows       = A->rmap->n;
650:         loTriFactor->csrMat->num_cols       = A->cmap->n;
651:         loTriFactor->csrMat->num_entries    = a->nz;
652:         loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
653:         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
654:         loTriFactor->csrMat->values         = new THRUSTARRAY(a->nz);
655:         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
656:         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
657:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

659:         /* Create the solve analysis information */
660:         PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
661:         PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
662:         PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
663:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
664:         PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

666:         /* perform the solve analysis */
667:         PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
668:                                                     loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

670:         PetscCallHIP(WaitForHIP());
671:         PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

673:         /* assign the pointer */
674:         ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

676:         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
677:         PetscCallHIP(hipHostFree(AiUp));
678:         PetscCallHIP(hipHostFree(AjUp));
679:       } else {
680:         /* Fill the upper triangular matrix */
681:         offset = 0;
682:         for (i = 0; i < n; i++) {
683:           /* set the pointers */
684:           v  = aa + ai[i];
685:           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

687:           /* first, set the diagonal elements */
688:           AAUp[offset] = 1.0 / v[nz];
689:           AALo[offset] = 1.0 / v[nz];

691:           offset += 1;
692:           if (nz > 0) {
693:             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
694:             for (j = offset; j < offset + nz; j++) {
695:               AAUp[j] = -AAUp[j];
696:               AALo[j] = AAUp[j] / v[nz];
697:             }
698:             offset += nz;
699:           }
700:         }
701:         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
702:         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703:         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
704:         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
705:         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
706:       }
707:       PetscCallHIP(hipHostFree(AAUp));
708:       PetscCallHIP(hipHostFree(AALo));
709:     } catch (char *ex) {
710:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
711:     }
712:   }
713:   PetscFunctionReturn(PETSC_SUCCESS);
714: }

716: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
717: {
718:   PetscBool                      perm_identity;
719:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
720:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
721:   IS                             ip                  = a->row;
722:   PetscInt                       n                   = A->rmap->n;

724:   PetscFunctionBegin;
725:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
726:   PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
727:   if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
728:   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

730:   A->offloadmask = PETSC_OFFLOAD_BOTH;
731:   /* lower triangular indices */
732:   PetscCall(ISIdentity(ip, &perm_identity));
733:   if (!perm_identity) {
734:     IS              iip;
735:     const PetscInt *irip, *rip;

737:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
738:     PetscCall(ISGetIndices(iip, &irip));
739:     PetscCall(ISGetIndices(ip, &rip));
740:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
741:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
742:     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
743:     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
744:     PetscCall(ISRestoreIndices(iip, &irip));
745:     PetscCall(ISDestroy(&iip));
746:     PetscCall(ISRestoreIndices(ip, &rip));
747:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
748:   }
749:   PetscFunctionReturn(PETSC_SUCCESS);
750: }

752: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
753: {
754:   PetscBool   perm_identity;
755:   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
756:   IS          ip = b->row;

758:   PetscFunctionBegin;
759:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
760:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
761:   B->offloadmask = PETSC_OFFLOAD_CPU;
762:   /* determine which version of MatSolve needs to be used. */
763:   PetscCall(ISIdentity(ip, &perm_identity));
764:   if (perm_identity) {
765:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
766:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
767:     B->ops->matsolve          = NULL;
768:     B->ops->matsolvetranspose = NULL;
769:   } else {
770:     B->ops->solve             = MatSolve_SeqAIJHIPSPARSE;
771:     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE;
772:     B->ops->matsolve          = NULL;
773:     B->ops->matsolvetranspose = NULL;
774:   }

776:   /* get the triangular factors */
777:   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
778:   PetscFunctionReturn(PETSC_SUCCESS);
779: }

781: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
782: {
783:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
784:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
785:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
786:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
787:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
788:   hipsparseIndexBase_t                indexBase;
789:   hipsparseMatrixType_t               matrixType;
790:   hipsparseFillMode_t                 fillMode;
791:   hipsparseDiagType_t                 diagType;

793:   PetscFunctionBegin;
794:   /* allocate space for the transpose of the lower triangular factor */
795:   PetscCall(PetscNew(&loTriFactorT));
796:   loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

798:   /* set the matrix descriptors of the lower triangular factor */
799:   matrixType = hipsparseGetMatType(loTriFactor->descr);
800:   indexBase  = hipsparseGetMatIndexBase(loTriFactor->descr);
801:   fillMode   = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
802:   diagType   = hipsparseGetMatDiagType(loTriFactor->descr);

804:   /* Create the matrix description */
805:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
806:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
807:   PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
808:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
809:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));

811:   /* set the operation */
812:   loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

814:   /* allocate GPU space for the CSC of the lower triangular factor*/
815:   loTriFactorT->csrMat                 = new CsrMatrix;
816:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
817:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
818:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
819:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
820:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
821:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

823:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
824:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
825: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
826:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
827:                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
828:                                                   loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
829:   PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
830: #endif
831: */
832:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

834:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
835:                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
836: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
837:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
838:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
839: #else
840:                                        loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
841: #endif

843:   PetscCallHIP(WaitForHIP());
844:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

846:   /* Create the solve analysis information */
847:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
848:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
849:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
850:                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
851:   PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));

853:   /* perform the solve analysis */
854:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
855:                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

857:   PetscCallHIP(WaitForHIP());
858:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

860:   /* assign the pointer */
861:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

863:   /*********************************************/
864:   /* Now the Transpose of the Upper Tri Factor */
865:   /*********************************************/

867:   /* allocate space for the transpose of the upper triangular factor */
868:   PetscCall(PetscNew(&upTriFactorT));
869:   upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

871:   /* set the matrix descriptors of the upper triangular factor */
872:   matrixType = hipsparseGetMatType(upTriFactor->descr);
873:   indexBase  = hipsparseGetMatIndexBase(upTriFactor->descr);
874:   fillMode   = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
875:   diagType   = hipsparseGetMatDiagType(upTriFactor->descr);

877:   /* Create the matrix description */
878:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
879:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
880:   PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
881:   PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
882:   PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));

884:   /* set the operation */
885:   upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

887:   /* allocate GPU space for the CSC of the upper triangular factor*/
888:   upTriFactorT->csrMat                 = new CsrMatrix;
889:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
890:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
891:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
892:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
893:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
894:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

896:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
897:   /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
898: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
899:   PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
900:                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
901:                                                   upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
902:   PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
903: #endif
904: */
905:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
906:   PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
907:                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
908: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
909:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
910:                           hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
911: #else
912:                                        upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
913: #endif

915:   PetscCallHIP(WaitForHIP());
916:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));

918:   /* Create the solve analysis information */
919:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
920:   PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
921:   PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
922:                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
923:   PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));

925:   /* perform the solve analysis */
926:   PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
927:                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

929:   PetscCallHIP(WaitForHIP());
930:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

932:   /* assign the pointer */
933:   ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
934:   PetscFunctionReturn(PETSC_SUCCESS);
935: }

937: struct PetscScalarToPetscInt {
938:   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
939: };

941: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
942: {
943:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
944:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
945:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
946:   hipsparseIndexBase_t           indexBase;

948:   PetscFunctionBegin;
949:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
950:   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
951:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
952:   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
953:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
954:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
955:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
956:   PetscCall(PetscLogGpuTimeBegin());
957:   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
958:   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
959:     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
960:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
961:     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
962:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
963:     PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

965:     /* set alpha and beta */
966:     PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
967:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
968:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
969:     PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
970:     PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
971:     PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

973:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
974:       CsrMatrix *matrixT      = new CsrMatrix;
975:       matstructT->mat         = matrixT;
976:       matrixT->num_rows       = A->cmap->n;
977:       matrixT->num_cols       = A->rmap->n;
978:       matrixT->num_entries    = a->nz;
979:       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
980:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
981:       matrixT->values         = new THRUSTARRAY(a->nz);

983:       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
984:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

986:       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
987:                                             indexBase, hipsparse_scalartype));
988:     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
989:       CsrMatrix *temp  = new CsrMatrix;
990:       CsrMatrix *tempT = new CsrMatrix;
991:       /* First convert HYB to CSR */
992:       temp->num_rows       = A->rmap->n;
993:       temp->num_cols       = A->cmap->n;
994:       temp->num_entries    = a->nz;
995:       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
996:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
997:       temp->values         = new THRUSTARRAY(a->nz);

999:       PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

1001:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1002:       tempT->num_rows       = A->rmap->n;
1003:       tempT->num_cols       = A->cmap->n;
1004:       tempT->num_entries    = a->nz;
1005:       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1006:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1007:       tempT->values         = new THRUSTARRAY(a->nz);

1009:       PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1010:                                            tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

1012:       /* Last, convert CSC to HYB */
1013:       hipsparseHybMat_t hybMat;
1014:       PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1015:       hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1016:       PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

1018:       /* assign the pointer */
1019:       matstructT->mat = hybMat;
1020:       A->transupdated = PETSC_TRUE;
1021:       /* delete temporaries */
1022:       if (tempT) {
1023:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1024:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1025:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1026:         delete (CsrMatrix *)tempT;
1027:       }
1028:       if (temp) {
1029:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1030:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1031:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1032:         delete (CsrMatrix *)temp;
1033:       }
1034:     }
1035:   }
1036:   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1037:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1038:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1039:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1040:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1041:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1042:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1043:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1044:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1045:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1046:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1047:     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1048:       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1049:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1050:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1051:     }
1052:     if (!hipsparsestruct->csr2csc_i) {
1053:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1054:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

1056:       indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1057:       if (matrix->num_entries) {
1058:         /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
1059:            Need to verify this for ROCm.
1060:         */
1061:         PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1062:                                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1063:       } else {
1064:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1065:       }

1067:       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1068:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1069:     }
1070:     PetscCallThrust(
1071:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1072:   }
1073:   PetscCall(PetscLogGpuTimeEnd());
1074:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1075:   /* the compressed row indices is not used for matTranspose */
1076:   matstructT->cprowIndices = NULL;
1077:   /* assign the pointer */
1078:   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1079:   A->transupdated                                 = PETSC_TRUE;
1080:   PetscFunctionReturn(PETSC_SUCCESS);
1081: }

1083: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1084: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1085: {
1086:   PetscInt                              n = xx->map->n;
1087:   const PetscScalar                    *barray;
1088:   PetscScalar                          *xarray;
1089:   thrust::device_ptr<const PetscScalar> bGPU;
1090:   thrust::device_ptr<PetscScalar>       xGPU;
1091:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1092:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1093:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1094:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1096:   PetscFunctionBegin;
1097:   /* Analyze the matrix and create the transpose ... on the fly */
1098:   if (!loTriFactorT && !upTriFactorT) {
1099:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1100:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1101:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1102:   }

1104:   /* Get the GPU pointers */
1105:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1106:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1107:   xGPU = thrust::device_pointer_cast(xarray);
1108:   bGPU = thrust::device_pointer_cast(barray);

1110:   PetscCall(PetscLogGpuTimeBegin());
1111:   /* First, reorder with the row permutation */
1112:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

1114:   /* First, solve U */
1115:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1116:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1118:   /* Then, solve L */
1119:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1120:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1122:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1123:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

1125:   /* Copy the temporary to the full solution. */
1126:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

1128:   /* restore */
1129:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1130:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1131:   PetscCall(PetscLogGpuTimeEnd());
1132:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1133:   PetscFunctionReturn(PETSC_SUCCESS);
1134: }

1136: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1137: {
1138:   const PetscScalar                  *barray;
1139:   PetscScalar                        *xarray;
1140:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1141:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1142:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT        = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1143:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1145:   PetscFunctionBegin;
1146:   /* Analyze the matrix and create the transpose ... on the fly */
1147:   if (!loTriFactorT && !upTriFactorT) {
1148:     PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1149:     loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1150:     upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1151:   }

1153:   /* Get the GPU pointers */
1154:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1155:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1157:   PetscCall(PetscLogGpuTimeBegin());
1158:   /* First, solve U */
1159:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1160:                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

1162:   /* Then, solve L */
1163:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1164:                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

1166:   /* restore */
1167:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1168:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1169:   PetscCall(PetscLogGpuTimeEnd());
1170:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1171:   PetscFunctionReturn(PETSC_SUCCESS);
1172: }

1174: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1175: {
1176:   const PetscScalar                    *barray;
1177:   PetscScalar                          *xarray;
1178:   thrust::device_ptr<const PetscScalar> bGPU;
1179:   thrust::device_ptr<PetscScalar>       xGPU;
1180:   Mat_SeqAIJHIPSPARSETriFactors        *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1181:   Mat_SeqAIJHIPSPARSETriFactorStruct   *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1182:   Mat_SeqAIJHIPSPARSETriFactorStruct   *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1183:   THRUSTARRAY                          *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1185:   PetscFunctionBegin;
1186:   /* Get the GPU pointers */
1187:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1188:   PetscCall(VecHIPGetArrayRead(bb, &barray));
1189:   xGPU = thrust::device_pointer_cast(xarray);
1190:   bGPU = thrust::device_pointer_cast(barray);

1192:   PetscCall(PetscLogGpuTimeBegin());
1193:   /* First, reorder with the row permutation */
1194:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());

1196:   /* Next, solve L */
1197:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1198:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1200:   /* Then, solve U */
1201:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1202:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1204:   /* Last, reorder with the column permutation */
1205:   thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);

1207:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1208:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1209:   PetscCall(PetscLogGpuTimeEnd());
1210:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1211:   PetscFunctionReturn(PETSC_SUCCESS);
1212: }

1214: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1215: {
1216:   const PetscScalar                  *barray;
1217:   PetscScalar                        *xarray;
1218:   Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1219:   Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1220:   Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1221:   THRUSTARRAY                        *tempGPU             = (THRUSTARRAY *)hipsparseTriFactors->workVector;

1223:   PetscFunctionBegin;
1224:   /* Get the GPU pointers */
1225:   PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1226:   PetscCall(VecHIPGetArrayRead(bb, &barray));

1228:   PetscCall(PetscLogGpuTimeBegin());
1229:   /* First, solve L */
1230:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1231:                                            loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

1233:   /* Next, solve U */
1234:   PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1235:                                            upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

1237:   PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1238:   PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1239:   PetscCall(PetscLogGpuTimeEnd());
1240:   PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1241:   PetscFunctionReturn(PETSC_SUCCESS);
1242: }

1244: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1245: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1246: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1247: {
1248:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1249:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1250:   const PetscScalar             *barray;
1251:   PetscScalar                   *xarray;

1253:   PetscFunctionBegin;
1254:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1255:   PetscCall(VecHIPGetArrayRead(b, &barray));
1256:   PetscCall(PetscLogGpuTimeBegin());

1258:   /* Solve L*y = b */
1259:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1260:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1261:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1262:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                   /* L Y = X */
1263:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1264:   #else
1265:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L,                                     /* L Y = X */
1266:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1267:   #endif
1268:   /* Solve U*x = y */
1269:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1270:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1271:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1272:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1273:   #else
1274:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1275:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1276:   #endif
1277:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1278:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1280:   PetscCall(PetscLogGpuTimeEnd());
1281:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1282:   PetscFunctionReturn(PETSC_SUCCESS);
1283: }

1285: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1286: {
1287:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1288:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1289:   const PetscScalar             *barray;
1290:   PetscScalar                   *xarray;

1292:   PetscFunctionBegin;
1293:   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1294:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1295:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1296:                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1298:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1299:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1300:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1301:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1302:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1303:   }

1305:   if (!fs->updatedTransposeSpSVAnalysis) {
1306:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1308:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1309:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1310:   }

1312:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1313:   PetscCall(VecHIPGetArrayRead(b, &barray));
1314:   PetscCall(PetscLogGpuTimeBegin());

1316:   /* Solve Ut*y = b */
1317:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1318:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1319:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1320:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1321:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1322:   #else
1323:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1324:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1325:   #endif
1326:   /* Solve Lt*x = y */
1327:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1328:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1329:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1330:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1331:   #else
1332:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1333:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1334:   #endif
1335:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1336:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1337:   PetscCall(PetscLogGpuTimeEnd());
1338:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1339:   PetscFunctionReturn(PETSC_SUCCESS);
1340: }

1342: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1343: {
1344:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1345:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1346:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1347:   CsrMatrix                     *Acsr;
1348:   PetscInt                       m, nz;
1349:   PetscBool                      flg;

1351:   PetscFunctionBegin;
1352:   if (PetscDefined(USE_DEBUG)) {
1353:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1354:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1355:   }

1357:   /* Copy A's value to fact */
1358:   m  = fact->rmap->n;
1359:   nz = aij->nz;
1360:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1361:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1362:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1364:   /* Factorize fact inplace */
1365:   if (m)
1366:     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1367:                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1368:   if (PetscDefined(USE_DEBUG)) {
1369:     int               numerical_zero;
1370:     hipsparseStatus_t status;
1371:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1372:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1373:   }

1375:   /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1376:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1378:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

1380:   /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1381:   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

1383:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1384:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ILU0;
1385:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1386:   fact->ops->matsolve          = NULL;
1387:   fact->ops->matsolvetranspose = NULL;
1388:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1389:   PetscFunctionReturn(PETSC_SUCCESS);
1390: }

1392: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1393: {
1394:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1395:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1396:   PetscInt                       m, nz;

1398:   PetscFunctionBegin;
1399:   if (PetscDefined(USE_DEBUG)) {
1400:     PetscInt  i;
1401:     PetscBool flg, missing;

1403:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1407:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1408:   }

1410:   /* Free the old stale stuff */
1411:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1413:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414:      but they will not be used. Allocate them just for easy debugging.
1415:    */
1416:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1418:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1419:   fact->factortype             = MAT_FACTOR_ILU;
1420:   fact->info.factor_mallocs    = 0;
1421:   fact->info.fill_ratio_given  = info->fill;
1422:   fact->info.fill_ratio_needed = 1.0;

1424:   aij->row = NULL;
1425:   aij->col = NULL;

1427:   /* ====================================================================== */
1428:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1429:   /* We'll do in-place factorization on fact                                */
1430:   /* ====================================================================== */
1431:   const int *Ai, *Aj;

1433:   m  = fact->rmap->n;
1434:   nz = aij->nz;

1436:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1443:   /* ====================================================================== */
1444:   /* Create descriptors for M, L, U                                         */
1445:   /* ====================================================================== */
1446:   hipsparseFillMode_t fillMode;
1447:   hipsparseDiagType_t diagType;

1449:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1453:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458:   */
1459:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460:   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1465:   fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1471:   /* ========================================================================= */
1472:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1473:   /* ========================================================================= */
1474:   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475:   if (m)
1476:     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477:                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

1479:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1482:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1485:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1488:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

1491:   /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493:    */
1494:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496:     fs->spsvBuffer_L = fs->factBuffer_M;
1497:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498:   } else {
1499:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500:     fs->spsvBuffer_U = fs->factBuffer_M;
1501:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502:   }

1504:   /* ========================================================================== */
1505:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1506:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507:   /* ========================================================================== */
1508:   int structural_zero;

1510:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511:   if (m)
1512:     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514:   if (PetscDefined(USE_DEBUG)) {
1515:     /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516:     hipsparseStatus_t status;
1517:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519:   }

1521:   /* Estimate FLOPs of the numeric factorization */
1522:   {
1523:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1524:     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1525:     PetscLogDouble flops = 0.0;

1527:     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1528:     Ai    = Aseq->i;
1529:     Adiag = Aseq->diag;
1530:     for (PetscInt i = 0; i < m; i++) {
1531:       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532:         nzRow  = Ai[i + 1] - Ai[i];
1533:         nzLeft = Adiag[i] - Ai[i];
1534:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536:         */
1537:         nzLeft = (nzRow - 1) / 2;
1538:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539:       }
1540:     }
1541:     fs->numericFactFlops = flops;
1542:   }
1543:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544:   PetscFunctionReturn(PETSC_SUCCESS);
1545: }

1547: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1548: {
1549:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1550:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1551:   const PetscScalar             *barray;
1552:   PetscScalar                   *xarray;

1554:   PetscFunctionBegin;
1555:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1556:   PetscCall(VecHIPGetArrayRead(b, &barray));
1557:   PetscCall(PetscLogGpuTimeBegin());

1559:   /* Solve L*y = b */
1560:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1561:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1562:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1563:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1564:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1565:   #else
1566:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1567:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1568:   #endif
1569:   /* Solve Lt*x = y */
1570:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1571:   #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1572:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1573:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1574:   #else
1575:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1576:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1577:   #endif
1578:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1579:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1581:   PetscCall(PetscLogGpuTimeEnd());
1582:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1583:   PetscFunctionReturn(PETSC_SUCCESS);
1584: }

1586: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1587: {
1588:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1589:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1590:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1591:   CsrMatrix                     *Acsr;
1592:   PetscInt                       m, nz;
1593:   PetscBool                      flg;

1595:   PetscFunctionBegin;
1596:   if (PetscDefined(USE_DEBUG)) {
1597:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1598:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1599:   }

1601:   /* Copy A's value to fact */
1602:   m  = fact->rmap->n;
1603:   nz = aij->nz;
1604:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1605:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1606:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1608:   /* Factorize fact inplace */
1609:   /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1610:      The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1611:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1612:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1613:    */
1614:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1615:   if (PetscDefined(USE_DEBUG)) {
1616:     int               numerical_zero;
1617:     hipsparseStatus_t status;
1618:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1619:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1620:   }

1622:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1624:   /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1625:     ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1626:   */
1627:   PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

1629:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1630:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1631:   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1632:   fact->ops->matsolve          = NULL;
1633:   fact->ops->matsolvetranspose = NULL;
1634:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1635:   PetscFunctionReturn(PETSC_SUCCESS);
1636: }

1638: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1639: {
1640:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1641:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1642:   PetscInt                       m, nz;

1644:   PetscFunctionBegin;
1645:   if (PetscDefined(USE_DEBUG)) {
1646:     PetscInt  i;
1647:     PetscBool flg, missing;

1649:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1650:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1651:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1652:     PetscCall(MatMissingDiagonal(A, &missing, &i));
1653:     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1654:   }

1656:   /* Free the old stale stuff */
1657:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1659:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1660:      but they will not be used. Allocate them just for easy debugging.
1661:    */
1662:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1664:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1665:   fact->factortype             = MAT_FACTOR_ICC;
1666:   fact->info.factor_mallocs    = 0;
1667:   fact->info.fill_ratio_given  = info->fill;
1668:   fact->info.fill_ratio_needed = 1.0;

1670:   aij->row = NULL;
1671:   aij->col = NULL;

1673:   /* ====================================================================== */
1674:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1675:   /* We'll do in-place factorization on fact                                */
1676:   /* ====================================================================== */
1677:   const int *Ai, *Aj;

1679:   m  = fact->rmap->n;
1680:   nz = aij->nz;

1682:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1683:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1684:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1685:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1686:   PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1687:   PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1689:   /* ====================================================================== */
1690:   /* Create mat descriptors for M, L                                        */
1691:   /* ====================================================================== */
1692:   hipsparseFillMode_t fillMode;
1693:   hipsparseDiagType_t diagType;

1695:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1696:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1697:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1699:   /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1700:     hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1701:     assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1702:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1703:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1704:   */
1705:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1706:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1707:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1708:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1709:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1711:   /* ========================================================================= */
1712:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1713:   /* ========================================================================= */
1714:   PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1715:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

1717:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1718:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1720:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1721:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1723:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1724:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1726:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1727:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1729:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1730:      See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1731:    */
1732:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1733:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1734:     fs->spsvBuffer_L = fs->factBuffer_M;
1735:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1736:   } else {
1737:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1738:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1739:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1740:   }

1742:   /* ========================================================================== */
1743:   /* Perform analysis of ic0 on M                                               */
1744:   /* The lower triangular part of M has the same sparsity pattern as L          */
1745:   /* ========================================================================== */
1746:   int structural_zero;

1748:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1749:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1750:   if (PetscDefined(USE_DEBUG)) {
1751:     hipsparseStatus_t status;
1752:     /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1753:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1754:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1755:   }

1757:   /* Estimate FLOPs of the numeric factorization */
1758:   {
1759:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1760:     PetscInt      *Ai, nzRow, nzLeft;
1761:     PetscLogDouble flops = 0.0;

1763:     Ai = Aseq->i;
1764:     for (PetscInt i = 0; i < m; i++) {
1765:       nzRow = Ai[i + 1] - Ai[i];
1766:       if (nzRow > 1) {
1767:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1768:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1769:         */
1770:         nzLeft = (nzRow - 1) / 2;
1771:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1772:       }
1773:     }
1774:     fs->numericFactFlops = flops;
1775:   }
1776:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1777:   PetscFunctionReturn(PETSC_SUCCESS);
1778: }
1779: #endif

1781: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1782: {
1783:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1785:   PetscFunctionBegin;
1786: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1787:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1788:   if (hipsparseTriFactors->factorizeOnDevice) {
1789:     PetscCall(ISIdentity(isrow, &row_identity));
1790:     PetscCall(ISIdentity(iscol, &col_identity));
1791:   }
1792:   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1793:   else
1794: #endif
1795:   {
1796:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1797:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1798:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1799:   }
1800:   PetscFunctionReturn(PETSC_SUCCESS);
1801: }

1803: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1804: {
1805:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1807:   PetscFunctionBegin;
1808:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1809:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1810:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1811:   PetscFunctionReturn(PETSC_SUCCESS);
1812: }

1814: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1815: {
1816:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1818:   PetscFunctionBegin;
1819: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1820:   PetscBool perm_identity = PETSC_FALSE;
1821:   if (hipsparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1822:   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1823:   else
1824: #endif
1825:   {
1826:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1827:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1828:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1829:   }
1830:   PetscFunctionReturn(PETSC_SUCCESS);
1831: }

1833: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1834: {
1835:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1837:   PetscFunctionBegin;
1838:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1839:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1840:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1841:   PetscFunctionReturn(PETSC_SUCCESS);
1842: }

1844: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1845: {
1846:   PetscFunctionBegin;
1847:   *type = MATSOLVERHIPSPARSE;
1848:   PetscFunctionReturn(PETSC_SUCCESS);
1849: }

1851: /*MC
  MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJHIPSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  hipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

1859:   Level: beginner

1861: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1862: M*/

1864: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1865: {
1866:   PetscInt  n = A->rmap->n;
1867:   PetscBool factOnDevice, factOnHost;
1868:   char     *prefix;
1869:   char      factPlace[32] = "device"; /* the default */

1871:   PetscFunctionBegin;
1872:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1873:   PetscCall(MatSetSizes(*B, n, n, n, n));
1874:   (*B)->factortype = ftype;
1875:   PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

1877:   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1878:   PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
1879:   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1880:   PetscOptionsEnd();
1881:   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1882:   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1883:   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1884:   ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

1886:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1887:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1888:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1889:     if (!A->boundtocpu) {
1890:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1891:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1892:     } else {
1893:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1894:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1895:     }
1896:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1897:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1898:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1899:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1900:     if (!A->boundtocpu) {
1901:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1902:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1903:     } else {
1904:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1905:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1906:     }
1907:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1908:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1909:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

1911:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1912:   (*B)->canuseordering = PETSC_TRUE;
1913:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1914:   PetscFunctionReturn(PETSC_SUCCESS);
1915: }

1917: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1918: {
1919:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
1920:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1921: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1922:   Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1923: #endif

1925:   PetscFunctionBegin;
1926:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1927:     PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1928:     if (A->factortype == MAT_FACTOR_NONE) {
1929:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1930:       PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1931:     }
1932: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1933:     else if (fs->csrVal) {
1934:       /* We have a factorized matrix on device and are able to copy it to host */
1935:       PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1936:     }
1937: #endif
1938:     else
1939:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1940:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1941:     PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1942:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1943:   }
1944:   PetscFunctionReturn(PETSC_SUCCESS);
1945: }

1947: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1948: {
1949:   PetscFunctionBegin;
1950:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1951:   *array = ((Mat_SeqAIJ *)A->data)->a;
1952:   PetscFunctionReturn(PETSC_SUCCESS);
1953: }

1955: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1956: {
1957:   PetscFunctionBegin;
1958:   A->offloadmask = PETSC_OFFLOAD_CPU;
1959:   *array         = NULL;
1960:   PetscFunctionReturn(PETSC_SUCCESS);
1961: }

1963: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1964: {
1965:   PetscFunctionBegin;
1966:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1967:   *array = ((Mat_SeqAIJ *)A->data)->a;
1968:   PetscFunctionReturn(PETSC_SUCCESS);
1969: }

1971: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1972: {
1973:   PetscFunctionBegin;
1974:   *array = NULL;
1975:   PetscFunctionReturn(PETSC_SUCCESS);
1976: }

1978: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1979: {
1980:   PetscFunctionBegin;
1981:   *array = ((Mat_SeqAIJ *)A->data)->a;
1982:   PetscFunctionReturn(PETSC_SUCCESS);
1983: }

1985: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1986: {
1987:   PetscFunctionBegin;
1988:   A->offloadmask = PETSC_OFFLOAD_CPU;
1989:   *array         = NULL;
1990:   PetscFunctionReturn(PETSC_SUCCESS);
1991: }

1993: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1994: {
1995:   Mat_SeqAIJHIPSPARSE *cusp;
1996:   CsrMatrix           *matrix;

1998:   PetscFunctionBegin;
1999:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2000:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2001:   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
2002:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2003:   matrix = (CsrMatrix *)cusp->mat->mat;

2005:   if (i) {
2006: #if !defined(PETSC_USE_64BIT_INDICES)
2007:     *i = matrix->row_offsets->data().get();
2008: #else
2009:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2010: #endif
2011:   }
2012:   if (j) {
2013: #if !defined(PETSC_USE_64BIT_INDICES)
2014:     *j = matrix->column_indices->data().get();
2015: #else
2016:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2017: #endif
2018:   }
2019:   if (a) *a = matrix->values->data().get();
2020:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2021:   PetscFunctionReturn(PETSC_SUCCESS);
2022: }

2024: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2025: {
2026:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2027:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
2028:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2029:   PetscBool                      both            = PETSC_TRUE;
2030:   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;

2032:   PetscFunctionBegin;
2033:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2034:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2035:     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2036:       CsrMatrix *matrix;
2037:       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

2039:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2040:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2041:       matrix->values->assign(a->a, a->a + a->nz);
2042:       PetscCallHIP(WaitForHIP());
2043:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2044:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2045:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2046:     } else {
2047:       PetscInt nnz;
2048:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2049:       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2050:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2051:       delete hipsparsestruct->workVector;
2052:       delete hipsparsestruct->rowoffsets_gpu;
2053:       hipsparsestruct->workVector     = NULL;
2054:       hipsparsestruct->rowoffsets_gpu = NULL;
2055:       try {
2056:         if (a->compressedrow.use) {
2057:           m    = a->compressedrow.nrows;
2058:           ii   = a->compressedrow.i;
2059:           ridx = a->compressedrow.rindex;
2060:         } else {
2061:           m    = A->rmap->n;
2062:           ii   = a->i;
2063:           ridx = NULL;
2064:         }
2065:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2066:         if (!a->a) {
2067:           nnz  = ii[m];
2068:           both = PETSC_FALSE;
2069:         } else nnz = a->nz;
2070:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

2072:         /* create hipsparse matrix */
2073:         hipsparsestruct->nrows = m;
2074:         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
2075:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2076:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2077:         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

2079:         PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2080:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2081:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2082:         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2083:         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2084:         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2085:         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

2087:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2088:         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2089:           /* set the matrix */
2090:           CsrMatrix *mat      = new CsrMatrix;
2091:           mat->num_rows       = m;
2092:           mat->num_cols       = A->cmap->n;
2093:           mat->num_entries    = nnz;
2094:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2095:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2096:           mat->values         = new THRUSTARRAY(nnz);
2097:           mat->row_offsets->assign(ii, ii + m + 1);
2098:           mat->column_indices->assign(a->j, a->j + nnz);
2099:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2101:           /* assign the pointer */
2102:           matstruct->mat = mat;
2103:           if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2104:             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2105:                                                   HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2106:           }
2107:         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2108:           CsrMatrix *mat      = new CsrMatrix;
2109:           mat->num_rows       = m;
2110:           mat->num_cols       = A->cmap->n;
2111:           mat->num_entries    = nnz;
2112:           mat->row_offsets    = new THRUSTINTARRAY32(m + 1);
2113:           mat->column_indices = new THRUSTINTARRAY32(nnz);
2114:           mat->values         = new THRUSTARRAY(nnz);
2115:           mat->row_offsets->assign(ii, ii + m + 1);
2116:           mat->column_indices->assign(a->j, a->j + nnz);
2117:           if (a->a) mat->values->assign(a->a, a->a + nnz);

2119:           hipsparseHybMat_t hybMat;
2120:           PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2121:           hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2122:           PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2123:           /* assign the pointer */
2124:           matstruct->mat = hybMat;

2126:           if (mat) {
2127:             if (mat->values) delete (THRUSTARRAY *)mat->values;
2128:             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2129:             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2130:             delete (CsrMatrix *)mat;
2131:           }
2132:         }

2134:         /* assign the compressed row indices */
2135:         if (a->compressedrow.use) {
2136:           hipsparsestruct->workVector = new THRUSTARRAY(m);
2137:           matstruct->cprowIndices     = new THRUSTINTARRAY(m);
2138:           matstruct->cprowIndices->assign(ridx, ridx + m);
2139:           tmp = m;
2140:         } else {
2141:           hipsparsestruct->workVector = NULL;
2142:           matstruct->cprowIndices     = NULL;
2143:           tmp                         = 0;
2144:         }
2145:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

2147:         /* assign the pointer */
2148:         hipsparsestruct->mat = matstruct;
2149:       } catch (char *ex) {
2150:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2151:       }
2152:       PetscCallHIP(WaitForHIP());
2153:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2154:       hipsparsestruct->nonzerostate = A->nonzerostate;
2155:     }
2156:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2157:   }
2158:   PetscFunctionReturn(PETSC_SUCCESS);
2159: }

2161: struct VecHIPPlusEquals {
2162:   template <typename Tuple>
2163:   __host__ __device__ void operator()(Tuple t)
2164:   {
2165:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2166:   }
2167: };

2169: struct VecHIPEquals {
2170:   template <typename Tuple>
2171:   __host__ __device__ void operator()(Tuple t)
2172:   {
2173:     thrust::get<1>(t) = thrust::get<0>(t);
2174:   }
2175: };

2177: struct VecHIPEqualsReverse {
2178:   template <typename Tuple>
2179:   __host__ __device__ void operator()(Tuple t)
2180:   {
2181:     thrust::get<0>(t) = thrust::get<1>(t);
2182:   }
2183: };

2185: struct MatMatHipsparse {
2186:   PetscBool             cisdense;
2187:   PetscScalar          *Bt;
2188:   Mat                   X;
2189:   PetscBool             reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2190:   PetscLogDouble        flops;
2191:   CsrMatrix            *Bcsr;
2192:   hipsparseSpMatDescr_t matSpBDescr;
2193:   PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
2194:   hipsparseDnMatDescr_t matBDescr;
2195:   hipsparseDnMatDescr_t matCDescr;
2196:   PetscInt              Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2197: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2198:   void *dBuffer4, *dBuffer5;
2199: #endif
2200:   size_t                 mmBufferSize;
2201:   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2202:   hipsparseSpGEMMDescr_t spgemmDesc;
2203: };

2205: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2206: {
2207:   MatMatHipsparse *mmdata = (MatMatHipsparse *)data;

2209:   PetscFunctionBegin;
2210:   PetscCallHIP(hipFree(mmdata->Bt));
2211:   delete mmdata->Bcsr;
2212:   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2213:   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2214:   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2215:   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2216: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2217:   if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2218:   if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2219: #endif
2220:   if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2221:   if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2222:   PetscCall(MatDestroy(&mmdata->X));
2223:   PetscCall(PetscFree(data));
2224:   PetscFunctionReturn(PETSC_SUCCESS);
2225: }

2227: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2228: {
2229:   Mat_Product                   *product = C->product;
2230:   Mat                            A, B;
2231:   PetscInt                       m, n, blda, clda;
2232:   PetscBool                      flg, biship;
2233:   Mat_SeqAIJHIPSPARSE           *cusp;
2234:   hipsparseOperation_t           opA;
2235:   const PetscScalar             *barray;
2236:   PetscScalar                   *carray;
2237:   MatMatHipsparse               *mmdata;
2238:   Mat_SeqAIJHIPSPARSEMultStruct *mat;
2239:   CsrMatrix                     *csrmat;

2241:   PetscFunctionBegin;
2242:   MatCheckProduct(C, 1);
2243:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2244:   mmdata = (MatMatHipsparse *)product->data;
2245:   A      = product->A;
2246:   B      = product->B;
2247:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2248:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2249:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2250:      Instead of silently accepting the wrong answer, I prefer to raise the error */
2251:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2252:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2253:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2254:   switch (product->type) {
2255:   case MATPRODUCT_AB:
2256:   case MATPRODUCT_PtAP:
2257:     mat = cusp->mat;
2258:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2259:     m   = A->rmap->n;
2260:     n   = B->cmap->n;
2261:     break;
2262:   case MATPRODUCT_AtB:
2263:     if (!A->form_explicit_transpose) {
2264:       mat = cusp->mat;
2265:       opA = HIPSPARSE_OPERATION_TRANSPOSE;
2266:     } else {
2267:       PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2268:       mat = cusp->matTranspose;
2269:       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2270:     }
2271:     m = A->cmap->n;
2272:     n = B->cmap->n;
2273:     break;
2274:   case MATPRODUCT_ABt:
2275:   case MATPRODUCT_RARt:
2276:     mat = cusp->mat;
2277:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2278:     m   = A->rmap->n;
2279:     n   = B->rmap->n;
2280:     break;
2281:   default:
2282:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2283:   }
2284:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2285:   csrmat = (CsrMatrix *)mat->mat;
2286:   /* if the user passed a CPU matrix, copy the data to the GPU */
2287:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2288:   if (!biship) { PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); }
2289:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2290:   PetscCall(MatDenseGetLDA(B, &blda));
2291:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2292:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2293:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2294:   } else {
2295:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2296:     PetscCall(MatDenseGetLDA(C, &clda));
2297:   }

2299:   PetscCall(PetscLogGpuTimeBegin());
2300:   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2301:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2302:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2303:     size_t mmBufferSize;
2304:     if (mmdata->initialized && mmdata->Blda != blda) {
2305:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2306:       mmdata->matBDescr = NULL;
2307:     }
2308:     if (!mmdata->matBDescr) {
2309:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2310:       mmdata->Blda = blda;
2311:     }
2312:     if (mmdata->initialized && mmdata->Clda != clda) {
2313:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2314:       mmdata->matCDescr = NULL;
2315:     }
2316:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2317:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2318:       mmdata->Clda = clda;
2319:     }
2320:     if (!mat->matDescr) {
2321:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2322:                                             HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2323:     }
2324:     PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2325:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2326:       PetscCallHIP(hipFree(mmdata->mmBuffer));
2327:       PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2328:       mmdata->mmBufferSize = mmBufferSize;
2329:     }
2330:     mmdata->initialized = PETSC_TRUE;
2331:   } else {
2332:     /* to be safe, always update pointers of the mats */
2333:     PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2334:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2335:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2336:   }

2338:   /* do hipsparseSpMM, which supports transpose on B */
2339:   PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

2341:   PetscCall(PetscLogGpuTimeEnd());
2342:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2343:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2344:   if (product->type == MATPRODUCT_RARt) {
2345:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2346:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2347:   } else if (product->type == MATPRODUCT_PtAP) {
2348:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2349:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2350:   } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2351:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2352:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2353:   PetscFunctionReturn(PETSC_SUCCESS);
2354: }

2356: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2357: {
2358:   Mat_Product         *product = C->product;
2359:   Mat                  A, B;
2360:   PetscInt             m, n;
2361:   PetscBool            cisdense, flg;
2362:   MatMatHipsparse     *mmdata;
2363:   Mat_SeqAIJHIPSPARSE *cusp;

2365:   PetscFunctionBegin;
2366:   MatCheckProduct(C, 1);
2367:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2368:   A = product->A;
2369:   B = product->B;
2370:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2371:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2372:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2373:   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2374:   switch (product->type) {
2375:   case MATPRODUCT_AB:
2376:     m = A->rmap->n;
2377:     n = B->cmap->n;
2378:     break;
2379:   case MATPRODUCT_AtB:
2380:     m = A->cmap->n;
2381:     n = B->cmap->n;
2382:     break;
2383:   case MATPRODUCT_ABt:
2384:     m = A->rmap->n;
2385:     n = B->rmap->n;
2386:     break;
2387:   case MATPRODUCT_PtAP:
2388:     m = B->cmap->n;
2389:     n = B->cmap->n;
2390:     break;
2391:   case MATPRODUCT_RARt:
2392:     m = B->rmap->n;
2393:     n = B->rmap->n;
2394:     break;
2395:   default:
2396:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2397:   }
2398:   PetscCall(MatSetSizes(C, m, n, m, n));
2399:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2400:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2401:   PetscCall(MatSetType(C, MATSEQDENSEHIP));

2403:   /* product data */
2404:   PetscCall(PetscNew(&mmdata));
2405:   mmdata->cisdense = cisdense;
2406:   /* for these products we need intermediate storage */
2407:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2408:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2409:     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2410:     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2411:     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2412:     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2413:   }
2414:   C->product->data       = mmdata;
2415:   C->product->destroy    = MatDestroy_MatMatHipsparse;
2416:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2417:   PetscFunctionReturn(PETSC_SUCCESS);
2418: }

2420: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2421: {
2422:   Mat_Product                   *product = C->product;
2423:   Mat                            A, B;
2424:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2425:   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
2426:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2427:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2428:   PetscBool                      flg;
2429:   MatProductType                 ptype;
2430:   MatMatHipsparse               *mmdata;
2431:   hipsparseSpMatDescr_t          BmatSpDescr;
2432:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2434:   PetscFunctionBegin;
2435:   MatCheckProduct(C, 1);
2436:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2437:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2438:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2439:   mmdata = (MatMatHipsparse *)C->product->data;
2440:   A      = product->A;
2441:   B      = product->B;
2442:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2443:     mmdata->reusesym = PETSC_FALSE;
2444:     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2445:     PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2446:     Cmat = Ccusp->mat;
2447:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2448:     Ccsr = (CsrMatrix *)Cmat->mat;
2449:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2450:     goto finalize;
2451:   }
2452:   if (!c->nz) goto finalize;
2453:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2454:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2455:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2456:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2457:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2458:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2459:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2460:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2461:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2462:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2463:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2464:   PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2465:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2466:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

2468:   ptype = product->type;
2469:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2470:     ptype = MATPRODUCT_AB;
2471:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2472:   }
2473:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2474:     ptype = MATPRODUCT_AB;
2475:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2476:   }
2477:   switch (ptype) {
2478:   case MATPRODUCT_AB:
2479:     Amat = Acusp->mat;
2480:     Bmat = Bcusp->mat;
2481:     break;
2482:   case MATPRODUCT_AtB:
2483:     Amat = Acusp->matTranspose;
2484:     Bmat = Bcusp->mat;
2485:     break;
2486:   case MATPRODUCT_ABt:
2487:     Amat = Acusp->mat;
2488:     Bmat = Bcusp->matTranspose;
2489:     break;
2490:   default:
2491:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2492:   }
2493:   Cmat = Ccusp->mat;
2494:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2495:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2496:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2497:   Acsr = (CsrMatrix *)Amat->mat;
2498:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2499:   Ccsr = (CsrMatrix *)Cmat->mat;
2500:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2501:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2502:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2503:   PetscCall(PetscLogGpuTimeBegin());
2504: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2505:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2506:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2507:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2508:   PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2509:   #else
2510:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2511:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2512:   #endif
2513: #else
2514:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2515:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2516:                                           Ccsr->column_indices->data().get()));
2517: #endif
2518:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2519:   PetscCallHIP(WaitForHIP());
2520:   PetscCall(PetscLogGpuTimeEnd());
2521:   C->offloadmask = PETSC_OFFLOAD_GPU;
2522: finalize:
2523:   /* shorter version of MatAssemblyEnd_SeqAIJ */
2524:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2525:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2526:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2527:   c->reallocs = 0;
2528:   C->info.mallocs += 0;
2529:   C->info.nz_unneeded = 0;
2530:   C->assembled = C->was_assembled = PETSC_TRUE;
2531:   C->num_ass++;
2532:   PetscFunctionReturn(PETSC_SUCCESS);
2533: }

2535: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2536: {
2537:   Mat_Product                   *product = C->product;
2538:   Mat                            A, B;
2539:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
2540:   Mat_SeqAIJ                    *a, *b, *c;
2541:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2542:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
2543:   PetscInt                       i, j, m, n, k;
2544:   PetscBool                      flg;
2545:   MatProductType                 ptype;
2546:   MatMatHipsparse               *mmdata;
2547:   PetscLogDouble                 flops;
2548:   PetscBool                      biscompressed, ciscompressed;
2549: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2550:   int64_t               C_num_rows1, C_num_cols1, C_nnz1;
2551:   hipsparseSpMatDescr_t BmatSpDescr;
2552: #else
2553:   int cnz;
2554: #endif
2555:   hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

2557:   PetscFunctionBegin;
2558:   MatCheckProduct(C, 1);
2559:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2560:   A = product->A;
2561:   B = product->B;
2562:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2563:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2564:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2565:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2566:   a = (Mat_SeqAIJ *)A->data;
2567:   b = (Mat_SeqAIJ *)B->data;
2568:   /* product data */
2569:   PetscCall(PetscNew(&mmdata));
2570:   C->product->data    = mmdata;
2571:   C->product->destroy = MatDestroy_MatMatHipsparse;

2573:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2574:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2575:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2576:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2577:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2578:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

2580:   ptype = product->type;
2581:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2582:     ptype                                          = MATPRODUCT_AB;
2583:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2584:   }
2585:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2586:     ptype                                          = MATPRODUCT_AB;
2587:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2588:   }
2589:   biscompressed = PETSC_FALSE;
2590:   ciscompressed = PETSC_FALSE;
2591:   switch (ptype) {
2592:   case MATPRODUCT_AB:
2593:     m    = A->rmap->n;
2594:     n    = B->cmap->n;
2595:     k    = A->cmap->n;
2596:     Amat = Acusp->mat;
2597:     Bmat = Bcusp->mat;
2598:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2599:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2600:     break;
2601:   case MATPRODUCT_AtB:
2602:     m = A->cmap->n;
2603:     n = B->cmap->n;
2604:     k = A->rmap->n;
2605:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2606:     Amat = Acusp->matTranspose;
2607:     Bmat = Bcusp->mat;
2608:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2609:     break;
2610:   case MATPRODUCT_ABt:
2611:     m = A->rmap->n;
2612:     n = B->rmap->n;
2613:     k = A->cmap->n;
2614:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2615:     Amat = Acusp->mat;
2616:     Bmat = Bcusp->matTranspose;
2617:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2618:     break;
2619:   default:
2620:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2621:   }

2623:   /* create hipsparse matrix */
2624:   PetscCall(MatSetSizes(C, m, n, m, n));
2625:   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2626:   c     = (Mat_SeqAIJ *)C->data;
2627:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2628:   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2629:   Ccsr  = new CsrMatrix;

2631:   c->compressedrow.use = ciscompressed;
2632:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2633:     c->compressedrow.nrows = a->compressedrow.nrows;
2634:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2635:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2636:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2637:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2638:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2639:   } else {
2640:     c->compressedrow.nrows  = 0;
2641:     c->compressedrow.i      = NULL;
2642:     c->compressedrow.rindex = NULL;
2643:     Ccusp->workVector       = NULL;
2644:     Cmat->cprowIndices      = NULL;
2645:   }
2646:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2647:   Ccusp->mat        = Cmat;
2648:   Ccusp->mat->mat   = Ccsr;
2649:   Ccsr->num_rows    = Ccusp->nrows;
2650:   Ccsr->num_cols    = n;
2651:   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2652:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2653:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2654:   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2655:   PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2656:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2657:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2658:   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2659:   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2660:   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2661:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2662:     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2663:     c->nz                = 0;
2664:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2665:     Ccsr->values         = new THRUSTARRAY(c->nz);
2666:     goto finalizesym;
2667:   }

2669:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2670:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2671:   Acsr = (CsrMatrix *)Amat->mat;
2672:   if (!biscompressed) {
2673:     Bcsr        = (CsrMatrix *)Bmat->mat;
2674:     BmatSpDescr = Bmat->matDescr;
2675:   } else { /* we need to use row offsets for the full matrix */
2676:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2677:     Bcsr                 = new CsrMatrix;
2678:     Bcsr->num_rows       = B->rmap->n;
2679:     Bcsr->num_cols       = cBcsr->num_cols;
2680:     Bcsr->num_entries    = cBcsr->num_entries;
2681:     Bcsr->column_indices = cBcsr->column_indices;
2682:     Bcsr->values         = cBcsr->values;
2683:     if (!Bcusp->rowoffsets_gpu) {
2684:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2685:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2686:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2687:     }
2688:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2689:     mmdata->Bcsr      = Bcsr;
2690:     if (Bcsr->num_rows && Bcsr->num_cols) {
2691:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2692:     }
2693:     BmatSpDescr = mmdata->matSpBDescr;
2694:   }
2695:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2696:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2697:   /* precompute flops count */
2698:   if (ptype == MATPRODUCT_AB) {
2699:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2700:       const PetscInt st = a->i[i];
2701:       const PetscInt en = a->i[i + 1];
2702:       for (j = st; j < en; j++) {
2703:         const PetscInt brow = a->j[j];
2704:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2705:       }
2706:     }
2707:   } else if (ptype == MATPRODUCT_AtB) {
2708:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2709:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2710:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2711:       flops += (2. * anzi) * bnzi;
2712:     }
2713:   } else flops = 0.; /* TODO */

2715:   mmdata->flops = flops;
2716:   PetscCall(PetscLogGpuTimeBegin());
2717: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2718:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2719:   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2720:   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2721:   #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2722:   {
2723:     /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2724:      We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2725:   */
2726:     void *dBuffer1 = NULL;
2727:     void *dBuffer2 = NULL;
2728:     void *dBuffer3 = NULL;
2729:     /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2730:     size_t bufferSize1 = 0;
2731:     size_t bufferSize2 = 0;
2732:     size_t bufferSize3 = 0;
2733:     size_t bufferSize4 = 0;
2734:     size_t bufferSize5 = 0;

2736:     /* ask bufferSize1 bytes for external memory */
2737:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2738:     PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2739:     /* inspect the matrices A and B to understand the memory requirement for the next step */
2740:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));

2742:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2743:     PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2744:     PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2745:     PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2746:     PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2747:     PetscCallHIP(hipFree(dBuffer1));
2748:     PetscCallHIP(hipFree(dBuffer2));

2750:     /* get matrix C non-zero entries C_nnz1 */
2751:     PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2752:     c->nz = (PetscInt)C_nnz1;
2753:     /* allocate matrix C */
2754:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2755:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2756:     Ccsr->values = new THRUSTARRAY(c->nz);
2757:     PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2758:     /* update matC with the new pointers */
2759:     if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2760:       PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));

2762:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2763:       PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2764:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2765:       PetscCallHIP(hipFree(dBuffer3));
2766:       PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2767:     }
2768:     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2769:   }
2770:   #else
2771:   size_t bufSize2;
2772:   /* ask bufferSize bytes for external memory */
2773:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2774:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2775:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2776:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2777:   /* ask bufferSize again bytes for external memory */
2778:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2779:   /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2780:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2781:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2782:      is stored in the descriptor! What a messy API... */
2783:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2784:   /* compute the intermediate product of A * B */
2785:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2786:   /* get matrix C non-zero entries C_nnz1 */
2787:   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2788:   c->nz = (PetscInt)C_nnz1;
2789:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2790:                       mmdata->mmBufferSize / 1024));
2791:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2792:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2793:   Ccsr->values = new THRUSTARRAY(c->nz);
2794:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2795:   PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2796:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2797:   #endif
2798: #else
2799:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2800:   PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2801:                                           Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2802:   c->nz                = cnz;
2803:   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2804:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2805:   Ccsr->values = new THRUSTARRAY(c->nz);
2806:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */

2808:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2809:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2810:       I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2811:       D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2812:   PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2813:                                           Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2814:                                           Ccsr->column_indices->data().get()));
2815: #endif
2816:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2817:   PetscCall(PetscLogGpuTimeEnd());
2818: finalizesym:
2819:   c->singlemalloc = PETSC_FALSE;
2820:   c->free_a       = PETSC_TRUE;
2821:   c->free_ij      = PETSC_TRUE;
2822:   PetscCall(PetscMalloc1(m + 1, &c->i));
2823:   PetscCall(PetscMalloc1(c->nz, &c->j));
2824:   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2825:     PetscInt      *d_i = c->i;
2826:     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2827:     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2828:     ii = *Ccsr->row_offsets;
2829:     jj = *Ccsr->column_indices;
2830:     if (ciscompressed) d_i = c->compressedrow.i;
2831:     PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2832:     PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2833:   } else {
2834:     PetscInt *d_i = c->i;
2835:     if (ciscompressed) d_i = c->compressedrow.i;
2836:     PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2837:     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2838:   }
2839:   if (ciscompressed) { /* need to expand host row offsets */
2840:     PetscInt r = 0;
2841:     c->i[0]    = 0;
2842:     for (k = 0; k < c->compressedrow.nrows; k++) {
2843:       const PetscInt next = c->compressedrow.rindex[k];
2844:       const PetscInt old  = c->compressedrow.i[k];
2845:       for (; r < next; r++) c->i[r + 1] = old;
2846:     }
2847:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2848:   }
2849:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2850:   PetscCall(PetscMalloc1(m, &c->ilen));
2851:   PetscCall(PetscMalloc1(m, &c->imax));
2852:   c->maxnz         = c->nz;
2853:   c->nonzerorowcnt = 0;
2854:   c->rmax          = 0;
2855:   for (k = 0; k < m; k++) {
2856:     const PetscInt nn = c->i[k + 1] - c->i[k];
2857:     c->ilen[k] = c->imax[k] = nn;
2858:     c->nonzerorowcnt += (PetscInt) !!nn;
2859:     c->rmax = PetscMax(c->rmax, nn);
2860:   }
2861:   PetscCall(MatMarkDiagonal_SeqAIJ(C));
2862:   PetscCall(PetscMalloc1(c->nz, &c->a));
2863:   Ccsr->num_entries = c->nz;

2865:   C->nonzerostate++;
2866:   PetscCall(PetscLayoutSetUp(C->rmap));
2867:   PetscCall(PetscLayoutSetUp(C->cmap));
2868:   Ccusp->nonzerostate = C->nonzerostate;
2869:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2870:   C->preallocated     = PETSC_TRUE;
2871:   C->assembled        = PETSC_FALSE;
2872:   C->was_assembled    = PETSC_FALSE;
2873:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2874:     mmdata->reusesym = PETSC_TRUE;
2875:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2876:   }
2877:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2878:   PetscFunctionReturn(PETSC_SUCCESS);
2879: }

2881: /* handles sparse or dense B */
2882: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2883: {
2884:   Mat_Product *product = mat->product;
2885:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2887:   PetscFunctionBegin;
2888:   MatCheckProduct(mat, 1);
2889:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2890:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2891:   if (product->type == MATPRODUCT_ABC) {
2892:     Ciscusp = PETSC_FALSE;
2893:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2894:   }
2895:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2896:     PetscBool usecpu = PETSC_FALSE;
2897:     switch (product->type) {
2898:     case MATPRODUCT_AB:
2899:       if (product->api_user) {
2900:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2901:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2902:         PetscOptionsEnd();
2903:       } else {
2904:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2905:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2906:         PetscOptionsEnd();
2907:       }
2908:       break;
2909:     case MATPRODUCT_AtB:
2910:       if (product->api_user) {
2911:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2912:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2913:         PetscOptionsEnd();
2914:       } else {
2915:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2916:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2917:         PetscOptionsEnd();
2918:       }
2919:       break;
2920:     case MATPRODUCT_PtAP:
2921:       if (product->api_user) {
2922:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2923:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2924:         PetscOptionsEnd();
2925:       } else {
2926:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2927:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2928:         PetscOptionsEnd();
2929:       }
2930:       break;
2931:     case MATPRODUCT_RARt:
2932:       if (product->api_user) {
2933:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2934:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2935:         PetscOptionsEnd();
2936:       } else {
2937:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2938:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2939:         PetscOptionsEnd();
2940:       }
2941:       break;
2942:     case MATPRODUCT_ABC:
2943:       if (product->api_user) {
2944:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2945:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2946:         PetscOptionsEnd();
2947:       } else {
2948:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2949:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2950:         PetscOptionsEnd();
2951:       }
2952:       break;
2953:     default:
2954:       break;
2955:     }
2956:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2957:   }
2958:   /* dispatch */
2959:   if (isdense) {
2960:     switch (product->type) {
2961:     case MATPRODUCT_AB:
2962:     case MATPRODUCT_AtB:
2963:     case MATPRODUCT_ABt:
2964:     case MATPRODUCT_PtAP:
2965:     case MATPRODUCT_RARt:
2966:       if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2967:       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2968:       break;
2969:     case MATPRODUCT_ABC:
2970:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2971:       break;
2972:     default:
2973:       break;
2974:     }
2975:   } else if (Biscusp && Ciscusp) {
2976:     switch (product->type) {
2977:     case MATPRODUCT_AB:
2978:     case MATPRODUCT_AtB:
2979:     case MATPRODUCT_ABt:
2980:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2981:       break;
2982:     case MATPRODUCT_PtAP:
2983:     case MATPRODUCT_RARt:
2984:     case MATPRODUCT_ABC:
2985:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2986:       break;
2987:     default:
2988:       break;
2989:     }
2990:   } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2991:   PetscFunctionReturn(PETSC_SUCCESS);
2992: }

2994: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2995: {
2996:   PetscFunctionBegin;
2997:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2998:   PetscFunctionReturn(PETSC_SUCCESS);
2999: }

3001: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3002: {
3003:   PetscFunctionBegin;
3004:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3005:   PetscFunctionReturn(PETSC_SUCCESS);
3006: }

3008: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3009: {
3010:   PetscFunctionBegin;
3011:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3012:   PetscFunctionReturn(PETSC_SUCCESS);
3013: }

3015: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3016: {
3017:   PetscFunctionBegin;
3018:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3019:   PetscFunctionReturn(PETSC_SUCCESS);
3020: }

3022: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3023: {
3024:   PetscFunctionBegin;
3025:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3026:   PetscFunctionReturn(PETSC_SUCCESS);
3027: }

3029: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3030: {
3031:   int i = blockIdx.x * blockDim.x + threadIdx.x;
3032:   if (i < n) y[idx[i]] += x[i];
3033: }

3035: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3036: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3037: {
3038:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
3039:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3040:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3041:   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
3042:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3043:   PetscBool                      compressed;
3044:   PetscInt                       nx, ny;

3046:   PetscFunctionBegin;
3047:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3048:   if (!a->nz) {
3049:     if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3050:     else PetscCall(VecSeq_HIP::Set(zz, 0));
3051:     PetscFunctionReturn(PETSC_SUCCESS);
3052:   }
3053:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3054:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3055:   if (!trans) {
3056:     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3057:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3058:   } else {
3059:     if (herm || !A->form_explicit_transpose) {
3060:       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3061:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3062:     } else {
3063:       if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3064:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3065:     }
3066:   }
3067:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3068:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3069:   try {
3070:     PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3071:     if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3072:     else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

3074:     PetscCall(PetscLogGpuTimeBegin());
3075:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3076:       /* z = A x + beta y.
3077:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3078:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3079:       */
3080:       xptr = xarray;
3081:       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3082:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3083:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3084:           allocated to accommodate different uses. So we get the length info directly from mat.
3085:        */
3086:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3087:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3088:         nx             = mat->num_cols;
3089:         ny             = mat->num_rows;
3090:       }
3091:     } else {
3092:       /* z = A^T x + beta y
3093:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3094:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3095:        */
3096:       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3097:       dptr = zarray;
3098:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3099:       if (compressed) { /* Scatter x to work vector */
3100:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3101:         thrust::for_each(
3102: #if PetscDefined(HAVE_THRUST_ASYNC)
3103:           thrust::hip::par.on(PetscDefaultHipStream),
3104: #endif
3105:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3106:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3107:       }
3108:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3109:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3110:         nx             = mat->num_rows;
3111:         ny             = mat->num_cols;
3112:       }
3113:     }
3114:     /* csr_spmv does y = alpha op(A) x + beta y */
3115:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3116: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3117:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3118:       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3119:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3120:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3121:         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3122:                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
3123:         PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3124:         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3125:       } else {
3126:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3127:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3128:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3129:       }
3130:       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3131:                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3132: #else
3133:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3134:       PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3135: #endif
3136:     } else {
3137:       if (hipsparsestruct->nrows) {
3138:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3139:         PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3140:       }
3141:     }
3142:     PetscCall(PetscLogGpuTimeEnd());

3144:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3145:       if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
3146:         if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3147:           PetscCall(VecSeq_HIP::Copy(yy, zz));      /* zz = yy */
3148:         } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3149:           PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3150:         }
3151:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3152:         PetscCall(VecSeq_HIP::Set(zz, 0));
3153:       }

3155:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3156:       if (compressed) {
3157:         PetscCall(PetscLogGpuTimeBegin());
3158:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3159:            and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3160:            prevent that. So I just add a ScatterAdd kernel.
3161:          */
3162: #if 0
3163:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3164:         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3165:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3166:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3167:                          VecHIPPlusEquals());
3168: #else
3169:         PetscInt n = matstruct->cprowIndices->size();
3170:         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3171: #endif
3172:         PetscCall(PetscLogGpuTimeEnd());
3173:       }
3174:     } else {
3175:       if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3176:     }
3177:     PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3178:     if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3179:     else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3180:   } catch (char *ex) {
3181:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3182:   }
3183:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3184:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3185:   PetscFunctionReturn(PETSC_SUCCESS);
3186: }

3188: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3189: {
3190:   PetscFunctionBegin;
3191:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3192:   PetscFunctionReturn(PETSC_SUCCESS);
3193: }

3195: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3196: {
3197:   PetscFunctionBegin;
3198:   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3199:   PetscFunctionReturn(PETSC_SUCCESS);
3200: }

3202: /*@
3203:   MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3204:   This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for calculations.

3206:   Collective

3208:   Input Parameters:
3209: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3210: . m    - number of rows
3211: . n    - number of columns
3212: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3213: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

3215:   Output Parameter:
3216: . A - the matrix

3218:   Level: intermediate

3220:   Notes:
3221:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3222:   `MatXXXXSetPreallocation()` paradgm instead of this routine directly.
3223:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]

3225:   The AIJ format (compressed row storage), is fully compatible with standard Fortran
3226:   storage.  That is, the stored row and column indices can begin at
3227:   either one (as in Fortran) or zero.

3229:   Specify the preallocated storage with either `nz` or `nnz` (not both).
3230:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3231:   allocation.

3233: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3234: @*/
3235: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3236: {
3237:   PetscFunctionBegin;
3238:   PetscCall(MatCreate(comm, A));
3239:   PetscCall(MatSetSizes(*A, m, n, m, n));
3240:   PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
3241:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3242:   PetscFunctionReturn(PETSC_SUCCESS);
3243: }

3245: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3246: {
3247:   PetscFunctionBegin;
3248:   if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3249:   else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3250:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3251:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3252:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3253:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3254:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3255:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3256:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3257:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3258:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3259:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3260:   PetscCall(MatDestroy_SeqAIJ(A));
3261:   PetscFunctionReturn(PETSC_SUCCESS);
3262: }

3264: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3265: {
3266:   PetscFunctionBegin;
3267:   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3268:   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3269:   PetscFunctionReturn(PETSC_SUCCESS);
3270: }

3272: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3273: {
3274:   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3275:   Mat_SeqAIJHIPSPARSE *cy;
3276:   Mat_SeqAIJHIPSPARSE *cx;
3277:   PetscScalar         *ay;
3278:   const PetscScalar   *ax;
3279:   CsrMatrix           *csry, *csrx;

3281:   PetscFunctionBegin;
3282:   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3283:   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3284:   if (X->ops->axpy != Y->ops->axpy) {
3285:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3286:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3287:     PetscFunctionReturn(PETSC_SUCCESS);
3288:   }
3289:   /* if we are here, it means both matrices are bound to GPU */
3290:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3291:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3292:   PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3293:   PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3294:   csry = (CsrMatrix *)cy->mat->mat;
3295:   csrx = (CsrMatrix *)cx->mat->mat;
3296:   /* see if we can turn this into a hipblas axpy */
3297:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3298:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3299:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3300:     if (eq) str = SAME_NONZERO_PATTERN;
3301:   }
3302:   /* spgeam is buggy with one column */
3303:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3304:   if (str == SUBSET_NONZERO_PATTERN) {
3305:     PetscScalar b = 1.0;
3306: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3307:     size_t bufferSize;
3308:     void  *buffer;
3309: #endif

3311:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3312:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3313:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3314: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3315:     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3316:                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3317:     PetscCallHIP(hipMalloc(&buffer, bufferSize));
3318:     PetscCall(PetscLogGpuTimeBegin());
3319:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3320:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3321:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3322:     PetscCall(PetscLogGpuTimeEnd());
3323:     PetscCallHIP(hipFree(buffer));
3324: #else
3325:     PetscCall(PetscLogGpuTimeBegin());
3326:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3327:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3328:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3329:     PetscCall(PetscLogGpuTimeEnd());
3330: #endif
3331:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3332:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3333:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3334:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3335:   } else if (str == SAME_NONZERO_PATTERN) {
3336:     hipblasHandle_t hipblasv2handle;
3337:     PetscBLASInt    one = 1, bnz = 1;

3339:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3340:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3341:     PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3342:     PetscCall(PetscBLASIntCast(x->nz, &bnz));
3343:     PetscCall(PetscLogGpuTimeBegin());
3344:     PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3345:     PetscCall(PetscLogGpuFlops(2.0 * bnz));
3346:     PetscCall(PetscLogGpuTimeEnd());
3347:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3348:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3349:     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3350:   } else {
3351:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3352:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3353:   }
3354:   PetscFunctionReturn(PETSC_SUCCESS);
3355: }

3357: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3358: {
3359:   Mat_SeqAIJ     *y = (Mat_SeqAIJ *)Y->data;
3360:   PetscScalar    *ay;
3361:   hipblasHandle_t hipblasv2handle;
3362:   PetscBLASInt    one = 1, bnz = 1;

3364:   PetscFunctionBegin;
3365:   PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3366:   PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3367:   PetscCall(PetscBLASIntCast(y->nz, &bnz));
3368:   PetscCall(PetscLogGpuTimeBegin());
3369:   PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3370:   PetscCall(PetscLogGpuFlops(bnz));
3371:   PetscCall(PetscLogGpuTimeEnd());
3372:   PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3373:   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3374:   PetscFunctionReturn(PETSC_SUCCESS);
3375: }

3377: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3378: {
3379:   PetscBool   both = PETSC_FALSE;
3380:   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

3382:   PetscFunctionBegin;
3383:   if (A->factortype == MAT_FACTOR_NONE) {
3384:     Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3385:     if (spptr->mat) {
3386:       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3387:       if (matrix->values) {
3388:         both = PETSC_TRUE;
3389:         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3390:       }
3391:     }
3392:     if (spptr->matTranspose) {
3393:       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3394:       if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3395:     }
3396:   }
3397:   //PetscCall(MatZeroEntries_SeqAIJ(A));
3398:   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3399:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
3400:   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3401:   else A->offloadmask = PETSC_OFFLOAD_CPU;
3402:   PetscFunctionReturn(PETSC_SUCCESS);
3403: }

3405: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3406: {
3407:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

3409:   PetscFunctionBegin;
3410:   if (A->factortype != MAT_FACTOR_NONE) {
3411:     A->boundtocpu = flg;
3412:     PetscFunctionReturn(PETSC_SUCCESS);
3413:   }
3414:   if (flg) {
3415:     PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

3417:     A->ops->scale                     = MatScale_SeqAIJ;
3418:     A->ops->axpy                      = MatAXPY_SeqAIJ;
3419:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3420:     A->ops->mult                      = MatMult_SeqAIJ;
3421:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3422:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3423:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3424:     A->ops->multhermitiantranspose    = NULL;
3425:     A->ops->multhermitiantransposeadd = NULL;
3426:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3427:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3428:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3429:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3430:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3431:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3432:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3433:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3434:   } else {
3435:     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
3436:     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
3437:     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
3438:     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
3439:     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
3440:     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
3441:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3442:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3443:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3444:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3445:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3446:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3447:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3448:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3449:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3450:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3451:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3452:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3453:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3454:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3455:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3456:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3457:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3458:   }
3459:   A->boundtocpu = flg;
3460:   if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
3461:   else a->inode.use = PETSC_FALSE;
3462:   PetscFunctionReturn(PETSC_SUCCESS);
3463: }

3465: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3466: {
3467:   Mat B;

3469:   PetscFunctionBegin;
3470:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3471:   if (reuse == MAT_INITIAL_MATRIX) {
3472:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3473:   } else if (reuse == MAT_REUSE_MATRIX) {
3474:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3475:   }
3476:   B = *newmat;
3477:   PetscCall(PetscFree(B->defaultvectype));
3478:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3479:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3480:     if (B->factortype == MAT_FACTOR_NONE) {
3481:       Mat_SeqAIJHIPSPARSE *spptr;
3482:       PetscCall(PetscNew(&spptr));
3483:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3484:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3485:       spptr->format = MAT_HIPSPARSE_CSR;
3486: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3487:       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3488: #else
3489:       spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3490: #endif
3491:       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3492:       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

3494:       B->spptr = spptr;
3495:     } else {
3496:       Mat_SeqAIJHIPSPARSETriFactors *spptr;

3498:       PetscCall(PetscNew(&spptr));
3499:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3500:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3501:       B->spptr = spptr;
3502:     }
3503:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3504:   }
3505:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJHIPSPARSE;
3506:   B->ops->destroy        = MatDestroy_SeqAIJHIPSPARSE;
3507:   B->ops->setoption      = MatSetOption_SeqAIJHIPSPARSE;
3508:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3509:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJHIPSPARSE;
3510:   B->ops->duplicate      = MatDuplicate_SeqAIJHIPSPARSE;

3512:   PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3513:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3514:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3515: #if defined(PETSC_HAVE_HYPRE)
3516:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3517: #endif
3518:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3519:   PetscFunctionReturn(PETSC_SUCCESS);
3520: }

3522: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3523: {
3524:   PetscFunctionBegin;
3525:   PetscCall(MatCreate_SeqAIJ(B));
3526:   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
3527:   PetscFunctionReturn(PETSC_SUCCESS);
3528: }

3530: /*MC
3531:    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs

3533:    A matrix type whose data resides on AMD GPUs. These matrices can be in either
3534:    CSR, ELL, or Hybrid format.
3535:    All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.

3537:    Options Database Keys:
3538: +  -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3539: .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3540:                                        Other options include ell (ellpack) or hyb (hybrid).
3541: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3542: -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU

3544:   Level: beginner

3546: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3547: M*/

3549: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3550: {
3551:   PetscFunctionBegin;
3552:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3553:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3554:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3555:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3556:   PetscFunctionReturn(PETSC_SUCCESS);
3557: }

3559: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
3560: {
3561:   Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

3563:   PetscFunctionBegin;
3564:   if (cusp) {
3565:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
3566:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3567:     delete cusp->workVector;
3568:     delete cusp->rowoffsets_gpu;
3569:     delete cusp->csr2csc_i;
3570:     delete cusp->coords;
3571:     if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
3572:     PetscCall(PetscFree(mat->spptr));
3573:   }
3574:   PetscFunctionReturn(PETSC_SUCCESS);
3575: }

3577: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3578: {
3579:   PetscFunctionBegin;
3580:   if (*mat) {
3581:     delete (*mat)->values;
3582:     delete (*mat)->column_indices;
3583:     delete (*mat)->row_offsets;
3584:     delete *mat;
3585:     *mat = 0;
3586:   }
3587:   PetscFunctionReturn(PETSC_SUCCESS);
3588: }

3590: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3591: {
3592:   PetscFunctionBegin;
3593:   if (*trifactor) {
3594:     if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3595:     if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3596:     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3597:     if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3598:     if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3599:     if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3600:     PetscCall(PetscFree(*trifactor));
3601:   }
3602:   PetscFunctionReturn(PETSC_SUCCESS);
3603: }

3605: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3606: {
3607:   CsrMatrix *mat;

3609:   PetscFunctionBegin;
3610:   if (*matstruct) {
3611:     if ((*matstruct)->mat) {
3612:       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3613:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3614:         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3615:       } else {
3616:         mat = (CsrMatrix *)(*matstruct)->mat;
3617:         PetscCall(CsrMatrix_Destroy(&mat));
3618:       }
3619:     }
3620:     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3621:     delete (*matstruct)->cprowIndices;
3622:     if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3623:     if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3624:     if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));

3626:     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3627:     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3628:     for (int i = 0; i < 3; i++) {
3629:       if (mdata->hipSpMV[i].initialized) {
3630:         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3631:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3632:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3633:       }
3634:     }
3635:     delete *matstruct;
3636:     *matstruct = NULL;
3637:   }
3638:   PetscFunctionReturn(PETSC_SUCCESS);
3639: }

3641: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3642: {
3643:   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

3645:   PetscFunctionBegin;
3646:   if (fs) {
3647:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3648:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3649:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3650:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3651:     delete fs->rpermIndices;
3652:     delete fs->cpermIndices;
3653:     delete fs->workVector;
3654:     fs->rpermIndices  = NULL;
3655:     fs->cpermIndices  = NULL;
3656:     fs->workVector    = NULL;
3657:     fs->init_dev_prop = PETSC_FALSE;
3658: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3659:     PetscCallHIP(hipFree(fs->csrRowPtr));
3660:     PetscCallHIP(hipFree(fs->csrColIdx));
3661:     PetscCallHIP(hipFree(fs->csrVal));
3662:     PetscCallHIP(hipFree(fs->X));
3663:     PetscCallHIP(hipFree(fs->Y));
3664:     // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3665:     PetscCallHIP(hipFree(fs->spsvBuffer_L));
3666:     PetscCallHIP(hipFree(fs->spsvBuffer_U));
3667:     PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3668:     PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3669:     PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3670:     if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3671:     if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3672:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3673:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3674:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3675:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3676:     if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3677:     if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3678:     PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3679:     PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

3681:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
3682:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3683: #endif
3684:   }
3685:   PetscFunctionReturn(PETSC_SUCCESS);
3686: }

3688: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3689: {
3690:   hipsparseHandle_t handle;

3692:   PetscFunctionBegin;
3693:   if (*trifactors) {
3694:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3695:     if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3696:     PetscCall(PetscFree(*trifactors));
3697:   }
3698:   PetscFunctionReturn(PETSC_SUCCESS);
3699: }

3701: struct IJCompare {
3702:   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3703:   {
3704:     if (t1.get<0>() < t2.get<0>()) return true;
3705:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3706:     return false;
3707:   }
3708: };

3710: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3711: {
3712:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

3714:   PetscFunctionBegin;
3715:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3716:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3717:   if (destroy) {
3718:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3719:     delete cusp->csr2csc_i;
3720:     cusp->csr2csc_i = NULL;
3721:   }
3722:   A->transupdated = PETSC_FALSE;
3723:   PetscFunctionReturn(PETSC_SUCCESS);
3724: }

3726: static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(void *data)
3727: {
3728:   MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;

3730:   PetscFunctionBegin;
3731:   PetscCallHIP(hipFree(coo->perm));
3732:   PetscCallHIP(hipFree(coo->jmap));
3733:   PetscCall(PetscFree(coo));
3734:   PetscFunctionReturn(PETSC_SUCCESS);
3735: }

3737: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3738: {
3739:   PetscBool            dev_ij = PETSC_FALSE;
3740:   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
3741:   PetscInt            *i, *j;
3742:   PetscContainer       container_h, container_d;
3743:   MatCOOStruct_SeqAIJ *coo_h, *coo_d;

3745:   PetscFunctionBegin;
3746:   PetscCall(PetscGetMemType(coo_i, &mtype));
3747:   if (PetscMemTypeDevice(mtype)) {
3748:     dev_ij = PETSC_TRUE;
3749:     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
3750:     PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3751:     PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3752:   } else {
3753:     i = coo_i;
3754:     j = coo_j;
3755:   }
3756:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
3757:   if (dev_ij) PetscCall(PetscFree2(i, j));
3758:   mat->offloadmask = PETSC_OFFLOAD_CPU;
3759:   // Create the GPU memory
3760:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));

3762:   // Copy the COO struct to device
3763:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
3764:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
3765:   PetscCall(PetscMalloc1(1, &coo_d));
3766:   *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
3767:   PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
3768:   PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
3769:   PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
3770:   PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));

3772:   // Put the COO struct in a container and then attach that to the matrix
3773:   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
3774:   PetscCall(PetscContainerSetPointer(container_d, coo_d));
3775:   PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
3776:   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
3777:   PetscCall(PetscContainerDestroy(&container_d));
3778:   PetscFunctionReturn(PETSC_SUCCESS);
3779: }

3781: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3782: {
3783:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
3784:   const PetscCount grid_size = gridDim.x * blockDim.x;
3785:   for (; i < nnz; i += grid_size) {
3786:     PetscScalar sum = 0.0;
3787:     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3788:     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3789:   }
3790: }

3792: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3793: {
3794:   Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
3795:   Mat_SeqAIJHIPSPARSE *dev  = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3796:   PetscCount           Annz = seq->nz;
3797:   PetscMemType         memtype;
3798:   const PetscScalar   *v1 = v;
3799:   PetscScalar         *Aa;
3800:   PetscContainer       container;
3801:   MatCOOStruct_SeqAIJ *coo;

3803:   PetscFunctionBegin;
3804:   if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));

3806:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
3807:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

3809:   PetscCall(PetscGetMemType(v, &memtype));
3810:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
3811:     PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
3812:     PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
3813:   }

3815:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
3816:   else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

3818:   PetscCall(PetscLogGpuTimeBegin());
3819:   if (Annz) {
3820:     hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
3821:     PetscCallHIP(hipPeekAtLastError());
3822:   }
3823:   PetscCall(PetscLogGpuTimeEnd());

3825:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
3826:   else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

3828:   if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
3829:   PetscFunctionReturn(PETSC_SUCCESS);
3830: }

3832: /*@C
3833:   MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.

3835:   Not Collective

3837:   Input Parameters:
3838: + A          - the matrix
3839: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

3841:   Output Parameters:
3842: + i - the CSR row pointers
3843: - j - the CSR column indices

3845:   Level: developer

3847:   Note:
3848:   When compressed is true, the CSR structure does not contain empty rows

3850: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3851: @*/
3852: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3853: {
3854:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3855:   Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
3856:   CsrMatrix           *csr;

3858:   PetscFunctionBegin;
3860:   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
3861:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3862:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3863:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3864:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3865:   csr = (CsrMatrix *)cusp->mat->mat;
3866:   if (i) {
3867:     if (!compressed && a->compressedrow.use) { /* need full row offset */
3868:       if (!cusp->rowoffsets_gpu) {
3869:         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3870:         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3871:         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3872:       }
3873:       *i = cusp->rowoffsets_gpu->data().get();
3874:     } else *i = csr->row_offsets->data().get();
3875:   }
3876:   if (j) *j = csr->column_indices->data().get();
3877:   PetscFunctionReturn(PETSC_SUCCESS);
3878: }

3880: /*@C
3881:   MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`

3883:   Not Collective

3885:   Input Parameters:
3886: + A          - the matrix
3887: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3888: . i          - the CSR row pointers
3889: - j          - the CSR column indices

3891:   Level: developer

3893: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3894: @*/
3895: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3896: {
3897:   PetscFunctionBegin;
3899:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3900:   if (i) *i = NULL;
3901:   if (j) *j = NULL;
3902:   PetscFunctionReturn(PETSC_SUCCESS);
3903: }

3905: /*@C
3906:   MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3908:   Not Collective

3910:   Input Parameter:
3911: . A - a `MATSEQAIJHIPSPARSE` matrix

3913:   Output Parameter:
3914: . a - pointer to the device data

3916:   Level: developer

3918:   Note:
3919:   May trigger host-device copies if the up-to-date matrix data is on host

3921: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3922: @*/
3923: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3924: {
3925:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3926:   CsrMatrix           *csr;

3928:   PetscFunctionBegin;
3930:   PetscAssertPointer(a, 2);
3931:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3932:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3933:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3934:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3935:   csr = (CsrMatrix *)cusp->mat->mat;
3936:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3937:   *a = csr->values->data().get();
3938:   PetscFunctionReturn(PETSC_SUCCESS);
3939: }

3941: /*@C
3942:   MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`

3944:   Not Collective

3946:   Input Parameters:
3947: + A - a `MATSEQAIJHIPSPARSE` matrix
3948: - a - pointer to the device data

3950:   Level: developer

3952: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3953: @*/
3954: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3955: {
3956:   PetscFunctionBegin;
3958:   PetscAssertPointer(a, 2);
3959:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3960:   *a = NULL;
3961:   PetscFunctionReturn(PETSC_SUCCESS);
3962: }

3964: /*@C
3965:   MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3967:   Not Collective

3969:   Input Parameter:
3970: . A - a `MATSEQAIJHIPSPARSE` matrix

3972:   Output Parameter:
3973: . a - pointer to the device data

3975:   Level: developer

3977:   Note:
3978:   May trigger host-device copies if up-to-date matrix data is on host

3980: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3981: @*/
3982: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
3983: {
3984:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3985:   CsrMatrix           *csr;

3987:   PetscFunctionBegin;
3989:   PetscAssertPointer(a, 2);
3990:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3991:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3992:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3993:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3994:   csr = (CsrMatrix *)cusp->mat->mat;
3995:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3996:   *a             = csr->values->data().get();
3997:   A->offloadmask = PETSC_OFFLOAD_GPU;
3998:   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
3999:   PetscFunctionReturn(PETSC_SUCCESS);
4000: }
4001: /*@C
4002:   MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`

4004:   Not Collective

4006:   Input Parameters:
4007: + A - a `MATSEQAIJHIPSPARSE` matrix
4008: - a - pointer to the device data

4010:   Level: developer

4012: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4013: @*/
4014: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
4015: {
4016:   PetscFunctionBegin;
4018:   PetscAssertPointer(a, 2);
4019:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4020:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4021:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4022:   *a = NULL;
4023:   PetscFunctionReturn(PETSC_SUCCESS);
4024: }

4026: /*@C
4027:   MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

4029:   Not Collective

4031:   Input Parameter:
4032: . A - a `MATSEQAIJHIPSPARSE` matrix

4034:   Output Parameter:
4035: . a - pointer to the device data

4037:   Level: developer

4039:   Note:
4040:   Does not trigger host-device copies and flags data validity on the GPU

4042: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4043: @*/
4044: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4045: {
4046:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4047:   CsrMatrix           *csr;

4049:   PetscFunctionBegin;
4051:   PetscAssertPointer(a, 2);
4052:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4053:   PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4054:   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4055:   csr = (CsrMatrix *)cusp->mat->mat;
4056:   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4057:   *a             = csr->values->data().get();
4058:   A->offloadmask = PETSC_OFFLOAD_GPU;
4059:   PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4060:   PetscFunctionReturn(PETSC_SUCCESS);
4061: }

4063: /*@C
4064:   MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`

4066:   Not Collective

4068:   Input Parameters:
4069: + A - a `MATSEQAIJHIPSPARSE` matrix
4070: - a - pointer to the device data

4072:   Level: developer

4074: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4075: @*/
4076: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4077: {
4078:   PetscFunctionBegin;
4080:   PetscAssertPointer(a, 2);
4081:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4082:   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4083:   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4084:   *a = NULL;
4085:   PetscFunctionReturn(PETSC_SUCCESS);
4086: }

4088: struct IJCompare4 {
4089:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4090:   {
4091:     if (t1.get<0>() < t2.get<0>()) return true;
4092:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4093:     return false;
4094:   }
4095: };

4097: struct Shift {
4098:   int _shift;

4100:   Shift(int shift) : _shift(shift) { }
4101:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4102: };

4104: /*
        Merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

        A and B must have the same number of local rows. C has A->cmap->n + B->cmap->n columns and the
        column indices coming from B are shifted by A->cmap->n.

        reuse == MAT_INITIAL_MATRIX: C is created; its CSR structure and values are built on the device and
                                     the row offsets and column indices are copied to the host as well.
        any other accepted value   : only the values of the existing C are refreshed, using the permutation
                                     stored in Ccusp->coords by the initial call.
        MAT_INPLACE_MATRIX is rejected.

        On return C is flagged as assembled with its data on the GPU.
      */
4105: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4106: {
4107:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4108:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4109:   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4110:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
4111:   PetscInt                       Annz, Bnnz;
4112:   PetscInt                       i, m, n, zero = 0;

4114:   PetscFunctionBegin;
4117:   PetscAssertPointer(C, 4);
4118:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4119:   PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4120:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4121:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4122:   PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4123:   PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4124:   if (reuse == MAT_INITIAL_MATRIX) { /* create C and build its structure and values */
4125:     m = A->rmap->n;
4126:     n = A->cmap->n + B->cmap->n;
4127:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
4128:     PetscCall(MatSetSizes(*C, m, n, m, n));
4129:     PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4130:     c                       = (Mat_SeqAIJ *)(*C)->data;
4131:     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4132:     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
4133:     Ccsr                    = new CsrMatrix;
4134:     Cmat->cprowIndices      = NULL;
4135:     c->compressedrow.use    = PETSC_FALSE; /* C is stored with all m rows, no compressed-row format */
4136:     c->compressedrow.nrows  = 0;
4137:     c->compressedrow.i      = NULL;
4138:     c->compressedrow.rindex = NULL;
4139:     Ccusp->workVector       = NULL;
4140:     Ccusp->nrows            = m;
4141:     Ccusp->mat              = Cmat;
4142:     Ccusp->mat->mat         = Ccsr;
4143:     Ccsr->num_rows          = m;
4144:     Ccsr->num_cols          = n;
4145:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4146:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4147:     PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4148:     PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); /* device copies of the scalar constants one and zero */
4149:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4150:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4151:     PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4152:     PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4153:     PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4154:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4155:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4156:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4157:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

4159:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4160:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4161:     Annz                 = (PetscInt)Acsr->column_indices->size();
4162:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4163:     c->nz                = Annz + Bnnz;
4164:     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4165:     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4166:     Ccsr->values         = new THRUSTARRAY(c->nz);
4167:     Ccsr->num_entries    = c->nz;
4168:     Ccusp->coords        = new THRUSTINTARRAY(c->nz); /* positions in C of the entries of A (first Annz slots) followed by those of B; filled below, used when C is reused */
4169:     if (c->nz) {
4170:       auto              Acoo = new THRUSTINTARRAY32(Annz);
4171:       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
4172:       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
4173:       THRUSTINTARRAY32 *Aroff, *Broff;

4175:       if (a->compressedrow.use) { /* need full row offset */
4176:         if (!Acusp->rowoffsets_gpu) {
4177:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4178:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4179:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4180:         }
4181:         Aroff = Acusp->rowoffsets_gpu;
4182:       } else Aroff = Acsr->row_offsets;
4183:       if (b->compressedrow.use) { /* need full row offset */
4184:         if (!Bcusp->rowoffsets_gpu) {
4185:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4186:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4187:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4188:         }
4189:         Broff = Bcusp->rowoffsets_gpu;
4190:       } else Broff = Bcsr->row_offsets;
4191:       PetscCall(PetscLogGpuTimeBegin());
4192:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO)); /* CSR row offsets -> one row index per nonzero */
4193:       PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4194:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4195:       auto Aperm = thrust::make_constant_iterator(1); /* origin tag carried through the merge: 1 for entries of A ... */
4196:       auto Bperm = thrust::make_constant_iterator(0); /* ... and 0 for entries of B */
4197:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); /* column indices of B shifted by the number of columns of A */
4198:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4199:       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); /* receives the origin tag of every merged entry */
4200:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4201:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4202:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4203:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4204:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4205:       auto p1    = Ccusp->coords->begin(); /* destination for the positions of A's entries */
4206:       auto p2    = Ccusp->coords->begin(); /* destination for the positions of B's entries, Annz slots further */
4207:       thrust::advance(p2, Annz);
4208:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); /* merge the (row, column, value, tag) sequences, ordered by row then column */
4209:       auto cci = thrust::make_counting_iterator(zero);
4210:       auto cce = thrust::make_counting_iterator(c->nz);
4211: #if 0 //Errors on SUMMIT cuda 11.1.0
4212:       PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4213: #else
4214:       auto pred = thrust::identity<int>();
4215:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); /* indices whose tag is nonzero (from A) */
4216:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); /* indices whose tag is zero (from B) */
4217: #endif
4218:       PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO)); /* merged row indices -> CSR row offsets of C */
4219:       PetscCall(PetscLogGpuTimeEnd());
4220:       delete wPerm;
4221:       delete Acoo;
4222:       delete Bcoo;
4223:       delete Ccoo;
4224:       PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));

4226:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4227:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4228:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4229:         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4230:         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4231:         CsrMatrix                     *CcsrT = new CsrMatrix;
4232:         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4233:         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

4235:         (*C)->form_explicit_transpose = PETSC_TRUE;
4236:         (*C)->transupdated            = PETSC_TRUE;
4237:         Ccusp->rowoffsets_gpu         = NULL;
4238:         CmatT->cprowIndices           = NULL;
4239:         CmatT->mat                    = CcsrT;
4240:         CcsrT->num_rows               = n;
4241:         CcsrT->num_cols               = m;
4242:         CcsrT->num_entries            = c->nz;
4243:         CcsrT->row_offsets            = new THRUSTINTARRAY32(n + 1);
4244:         CcsrT->column_indices         = new THRUSTINTARRAY32(c->nz);
4245:         CcsrT->values                 = new THRUSTARRAY(c->nz);

4247:         PetscCall(PetscLogGpuTimeBegin());
              /* the transpose of C stacks the rows of the transpose of A on top of those of the transpose of B */
4248:         auto rT = CcsrT->row_offsets->begin();
4249:         if (AT) {
4250:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4251:           thrust::advance(rT, -1); /* step back so that the last offset copied from A is overwritten by the first (shifted) offset of B */
4252:         }
4253:         if (BT) {
4254:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); /* row offsets of B's transpose start after the a->nz entries of A */
4255:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4256:           thrust::copy(titb, tite, rT);
4257:         }
4258:         auto cT = CcsrT->column_indices->begin();
4259:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4260:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4261:         auto vT = CcsrT->values->begin();
4262:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4263:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4264:         PetscCall(PetscLogGpuTimeEnd());

4266:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4267:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4268:         PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4269:         PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4270:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4271:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4272:         PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4273:         PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4274:         PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

4276:         PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4277:         Ccusp->matTranspose = CmatT;
4278:       }
4279:     }

          /* fill the host side Mat_SeqAIJ structure of C: row offsets and column indices are copied back, c->a is only allocated */
4281:     c->singlemalloc = PETSC_FALSE;
4282:     c->free_a       = PETSC_TRUE;
4283:     c->free_ij      = PETSC_TRUE;
4284:     PetscCall(PetscMalloc1(m + 1, &c->i));
4285:     PetscCall(PetscMalloc1(c->nz, &c->j));
4286:     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4287:       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4288:       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4289:       ii = *Ccsr->row_offsets;
4290:       jj = *Ccsr->column_indices;
4291:       PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4292:       PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4293:     } else {
4294:       PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4295:       PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4296:     }
4297:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4298:     PetscCall(PetscMalloc1(m, &c->ilen));
4299:     PetscCall(PetscMalloc1(m, &c->imax));
4300:     c->maxnz         = c->nz;
4301:     c->nonzerorowcnt = 0;
4302:     c->rmax          = 0;
4303:     for (i = 0; i < m; i++) { /* per-row lengths, number of nonempty rows and longest row, from the row offsets */
4304:       const PetscInt nn = c->i[i + 1] - c->i[i];
4305:       c->ilen[i] = c->imax[i] = nn;
4306:       c->nonzerorowcnt += (PetscInt) !!nn;
4307:       c->rmax = PetscMax(c->rmax, nn);
4308:     }
4309:     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4310:     PetscCall(PetscMalloc1(c->nz, &c->a));
4311:     (*C)->nonzerostate++;
4312:     PetscCall(PetscLayoutSetUp((*C)->rmap));
4313:     PetscCall(PetscLayoutSetUp((*C)->cmap));
4314:     Ccusp->nonzerostate = (*C)->nonzerostate;
4315:     (*C)->preallocated  = PETSC_TRUE;
4316:   } else { /* reuse C: its nonzero structure is kept and only the values are refreshed */
4317:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4318:     c = (Mat_SeqAIJ *)(*C)->data;
4319:     if (c->nz) {
4320:       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4321:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4322:       PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4323:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4324:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4325:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4326:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4327:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4328:       Acsr = (CsrMatrix *)Acusp->mat->mat;
4329:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4330:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4331:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4332:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4333:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4334:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4335:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4336:       auto pmid = Ccusp->coords->begin(); /* boundary in coords between the positions of A's entries and those of B's entries */
4337:       thrust::advance(pmid, Acsr->num_entries);
4338:       PetscCall(PetscLogGpuTimeBegin());
            /* pair the values of A (then B) with the entries of C addressed through coords; VecHIPEquals is defined elsewhere and presumably assigns the former to the latter */
4339:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4340:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4341:       thrust::for_each(zibait, zieait, VecHIPEquals());
4342:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4343:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4344:       thrust::for_each(zibbit, ziebit, VecHIPEquals());
4345:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4346:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { /* refresh the values of the stored transpose of C as well */
4347:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4348:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4349:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4350:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4351:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4352:         auto       vT    = CcsrT->values->begin();
4353:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4354:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4355:         (*C)->transupdated = PETSC_TRUE;
4356:       }
4357:       PetscCall(PetscLogGpuTimeEnd());
4358:     }
4359:   }
4360:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4361:   (*C)->assembled     = PETSC_TRUE;
4362:   (*C)->was_assembled = PETSC_FALSE;
4363:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* the values of C were only written on the device in this routine */
4364:   PetscFunctionReturn(PETSC_SUCCESS);
4365: }

4367: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4368: {
4369:   bool               dmem;
4370:   const PetscScalar *av;

4372:   PetscFunctionBegin;
4373:   dmem = isHipMem(v);
4374:   PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4375:   if (n && idx) {
4376:     THRUSTINTARRAY widx(n);
4377:     widx.assign(idx, idx + n);
4378:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

4380:     THRUSTARRAY                    *w = NULL;
4381:     thrust::device_ptr<PetscScalar> dv;
4382:     if (dmem) dv = thrust::device_pointer_cast(v);
4383:     else {
4384:       w  = new THRUSTARRAY(n);
4385:       dv = w->data();
4386:     }
4387:     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

4389:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4390:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4391:     thrust::for_each(zibit, zieit, VecHIPEquals());
4392:     if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4393:     delete w;
4394:   } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));

4396:   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4397:   PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4398:   PetscFunctionReturn(PETSC_SUCCESS);
4399: }