Actual source code: aijhipsparse.hip.cxx

  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the HIPSPARSE library,
  4:   Portions of this code are under:
  5:   Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  6: */
  7: #include <petscconf.h>
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 10: #include <../src/mat/impls/dense/seq/dense.h>
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petsc/private/vecimpl.h>
 13: #undef VecType
 14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
 15: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
 16: #include <thrust/adjacent_difference.h>
 17: #include <thrust/iterator/transform_iterator.h>
 18: #if PETSC_CPP_VERSION >= 14
 19:   #define PETSC_HAVE_THRUST_ASYNC 1
 20:   #include <thrust/async/for_each.h>
 21: #endif
 22: #include <thrust/iterator/constant_iterator.h>
 23: #include <thrust/iterator/discard_iterator.h>
 24: #include <thrust/binary_search.h>
 25: #include <thrust/remove.h>
 26: #include <thrust/sort.h>
 27: #include <thrust/unique.h>
 28: #include <thrust/gather.h>

 30: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
 31: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
 32: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
 33: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

 35: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 36: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
 37: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
 38: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
 39: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
 40: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
 41: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
 42: static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 43: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 44: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 45: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 46: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 47: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
 48: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
 49: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

 51: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
 52: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
 53: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
 54: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);

 56: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
 57: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);

 59: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
 60: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
 61: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

 63: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);

 65: // cusparseCreateCsr() separates types for row offsets and column indices in prototype, but requires them to have the same type at runtime!
 66: const hipsparseIndexType_t csrRowOffsetsType = PetscDefined(USE_64BIT_INDICES) ? HIPSPARSE_INDEX_64I : HIPSPARSE_INDEX_32I;
 67: const hipsparseIndexType_t csrColIndType     = PetscDefined(USE_64BIT_INDICES) ? HIPSPARSE_INDEX_64I : HIPSPARSE_INDEX_32I;

 69: using Csr2coo        = Petsc::mat::aij::cupm::impl::Csr2coo;
 70: using PetscIntToCInt = Petsc::mat::aij::cupm::impl::PetscIntToCInt;

 72: /*
 73: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
 74: {
 75:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 77:   PetscFunctionBegin;
 78:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 79:   hipsparsestruct->stream = stream;
 80:   PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
 81:   PetscFunctionReturn(PETSC_SUCCESS);
 82: }

 84: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
 85: {
 86:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;

 88:   PetscFunctionBegin;
 89:   PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
 90:   if (hipsparsestruct->handle != handle) {
 91:     if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
 92:     hipsparsestruct->handle = handle;
 93:   }
 94:   PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
 95:   PetscFunctionReturn(PETSC_SUCCESS);
 96: }

 98: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
 99: {
100:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
101:   PetscBool            flg;

103:   PetscFunctionBegin;
104:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
105:   if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
106:   if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
107:   PetscFunctionReturn(PETSC_SUCCESS);
108: }
109: */

111: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
112: {
113:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

115:   PetscFunctionBegin;
116:   switch (op) {
117:   case MAT_HIPSPARSE_MULT:
118:     hipsparsestruct->format = format;
119:     break;
120:   case MAT_HIPSPARSE_ALL:
121:     hipsparsestruct->format = format;
122:     break;
123:   default:
124:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
125:   }
126:   PetscFunctionReturn(PETSC_SUCCESS);
127: }

129: /*@
130:   MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
131:   operation. Only the `MatMult()` operation can use different GPU storage formats

133:   Not Collective

135:   Input Parameters:
136: + A      - Matrix of type `MATSEQAIJHIPSPARSE`
137: . op     - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
138:          `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
139: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)

141:   Level: intermediate

143: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
144: @*/
145: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
146: {
147:   PetscFunctionBegin;
149:   PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
150:   PetscFunctionReturn(PETSC_SUCCESS);
151: }

153: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
154: {
155:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

157:   PetscFunctionBegin;
158:   hipsparsestruct->use_cpu_solve = use_cpu;
159:   PetscFunctionReturn(PETSC_SUCCESS);
160: }

162: /*@
163:   MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.

165:   Input Parameters:
166: + A       - Matrix of type `MATSEQAIJHIPSPARSE`
167: - use_cpu - set flag for using the built-in CPU `MatSolve()`

169:   Level: intermediate

171:   Notes:
172:   The hipSparse LU solver currently computes the factors with the built-in CPU method
173:   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
174:   This method to specifies if the solve is done on the CPU or GPU (GPU is the default).

176: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
177: @*/
178: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
179: {
180:   PetscFunctionBegin;
182:   PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
183:   PetscFunctionReturn(PETSC_SUCCESS);
184: }

186: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
187: {
188:   PetscFunctionBegin;
189:   switch (op) {
190:   case MAT_FORM_EXPLICIT_TRANSPOSE:
191:     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
192:     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
193:     A->form_explicit_transpose = flg;
194:     break;
195:   default:
196:     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
197:     break;
198:   }
199:   PetscFunctionReturn(PETSC_SUCCESS);
200: }

202: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
203: {
204:   MatHIPSPARSEStorageFormat format;
205:   PetscBool                 flg;
206:   Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

208:   PetscFunctionBegin;
209:   PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
210:   if (A->factortype == MAT_FACTOR_NONE) {
211:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
212:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
213:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
214:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
215:     PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
216:     if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
217:     PetscCall(
218:       PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
219:     /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
220:     PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
221:     PetscCall(
222:       PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
223:     PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
224:     /*
225:     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
226:     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
227:     */
228:   }
229:   PetscOptionsHeadEnd();
230:   PetscFunctionReturn(PETSC_SUCCESS);
231: }

233: static PetscErrorCode MatSeqAIJHIPSPARSEBuildFactoredMatrix_LU(Mat A)
234: {
235:   Mat_SeqAIJ                    *a  = static_cast<Mat_SeqAIJ *>(A->data);
236:   PetscInt                       m  = A->rmap->n;
237:   Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
238:   const PetscInt                *Ai = a->i, *Aj = a->j, *adiag;
239:   const MatScalar               *Aa = a->a;
240:   PetscInt                      *Mi, *Mj, Mnz;
241:   PetscScalar                   *Ma;

243:   PetscFunctionBegin;
244:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
245:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
246:     if (!fs->csrRowPtr) {                    // Is this the first time we are doing setup? Use csrRowPtr since it is not null even when m=0
247:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
248:       Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
249:       PetscCall(PetscMalloc1(m + 1, &Mi));
250:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
251:       PetscCall(PetscMalloc1(Mnz, &Ma));
252:       Mi[0] = 0;
253:       for (PetscInt i = 0; i < m; i++) {
254:         PetscInt llen = Ai[i + 1] - Ai[i];
255:         PetscInt ulen = adiag[i] - adiag[i + 1];
256:         PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
257:         Mj[Mi[i] + llen] = i;                                                             // diagonal entry
258:         PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
259:         Mi[i + 1] = Mi[i] + llen + ulen;
260:       }
261:       // Copy M (L,U) from host to device
262:       PetscCallHIP(hipMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
263:       PetscCallHIP(hipMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
264:       PetscCallHIP(hipMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
265:       PetscCallHIP(hipMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), hipMemcpyHostToDevice));
266:       PetscCallHIP(hipMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, hipMemcpyHostToDevice));

268:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
269:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
270:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
271:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
272:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
273:       hipsparseFillMode_t fillMode = HIPSPARSE_FILL_MODE_LOWER;
274:       hipsparseDiagType_t diagType = HIPSPARSE_DIAG_TYPE_UNIT;

276:       PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
277:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
278:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

280:       fillMode = HIPSPARSE_FILL_MODE_UPPER;
281:       diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
282:       PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
283:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
284:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

286:       // Allocate work vectors in SpSv
287:       PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(*fs->X) * m));
288:       PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

290:       PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
291:       PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

293:       // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = HIPSPARSE_OPERATION_NON_TRANSPOSE
294:       PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
295:       PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
296:       PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
297:       PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
298:       PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
299:       PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

301:       // Record for reuse
302:       fs->csrRowPtr_h = Mi;
303:       fs->csrVal_h    = Ma;
304:       PetscCall(PetscFree(Mj));
305:     }
306:     // Copy the value
307:     Mi  = fs->csrRowPtr_h;
308:     Ma  = fs->csrVal_h;
309:     Mnz = Mi[m];
310:     for (PetscInt i = 0; i < m; i++) {
311:       PetscInt llen = Ai[i + 1] - Ai[i];
312:       PetscInt ulen = adiag[i] - adiag[i + 1];
313:       PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
314:       Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];                                 // recover the diagonal entry
315:       PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
316:     }
317:     PetscCallHIP(hipMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, hipMemcpyHostToDevice));

319:     {
320:       // Do hipsparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
321:       PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

323:       PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
324:       fs->updatedSpSVAnalysis          = PETSC_TRUE;
325:       fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
326:     }
327:   }
328:   PetscFunctionReturn(PETSC_SUCCESS);
329: }

331: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
332: {
333:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
334:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
335:   IS                             isrow = a->row, isicol = a->icol;
336:   PetscBool                      row_identity, col_identity;
337:   PetscInt                       n = A->rmap->n;

339:   PetscFunctionBegin;
340:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
341:   PetscCall(MatSeqAIJHIPSPARSEBuildFactoredMatrix_LU(A));

343:   hipsparseTriFactors->nnz = a->nz;

345:   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
346:   /* lower triangular indices */
347:   PetscCall(ISIdentity(isrow, &row_identity));
348:   if (!row_identity && !hipsparseTriFactors->rpermIndices) {
349:     const PetscInt *r;

351:     PetscCall(ISGetIndices(isrow, &r));
352:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
353:     hipsparseTriFactors->rpermIndices->assign(r, r + n);
354:     PetscCall(ISRestoreIndices(isrow, &r));
355:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
356:   }

358:   /* upper triangular indices */
359:   PetscCall(ISIdentity(isicol, &col_identity));
360:   if (!col_identity && !hipsparseTriFactors->cpermIndices) {
361:     const PetscInt *c;

363:     PetscCall(ISGetIndices(isicol, &c));
364:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
365:     hipsparseTriFactors->cpermIndices->assign(c, c + n);
366:     PetscCall(ISRestoreIndices(isicol, &c));
367:     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
368:   }
369:   PetscFunctionReturn(PETSC_SUCCESS);
370: }

372: static PetscErrorCode MatSeqAIJHIPSPARSEBuildFactoredMatrix_Cholesky(Mat A)
373: {
374:   Mat_SeqAIJ                    *a  = static_cast<Mat_SeqAIJ *>(A->data);
375:   PetscInt                       m  = A->rmap->n;
376:   Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
377:   const PetscInt                *Ai = a->i, *Aj = a->j, *adiag;
378:   const MatScalar               *Aa = a->a;
379:   PetscInt                      *Mj, Mnz;
380:   PetscScalar                   *Ma, *D;

382:   PetscFunctionBegin;
383:   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
384:   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
385:     if (!fs->csrRowPtr) {                    // Is this the first time we are doing setup? Use csrRowPtr since it is not null even m=0
386:       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
387:       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
388:       Mnz = Ai[m]; // Unz (with the unit diagonal)
389:       PetscCall(PetscMalloc1(Mnz, &Ma));
390:       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
391:       PetscCall(PetscMalloc1(m, &D));    // the diagonal
392:       for (PetscInt i = 0; i < m; i++) {
393:         PetscInt ulen = Ai[i + 1] - Ai[i];
394:         Mj[Ai[i]]     = i;                                              // diagonal entry
395:         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
396:       }
397:       // Copy M (U) from host to device
398:       PetscCallHIP(hipMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
399:       PetscCallHIP(hipMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
400:       PetscCallHIP(hipMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
401:       PetscCallHIP(hipMalloc(&fs->diag, sizeof(*fs->diag) * m));
402:       PetscCallHIP(hipMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), hipMemcpyHostToDevice));
403:       PetscCallHIP(hipMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, hipMemcpyHostToDevice));

405:       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
406:       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
407:       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
408:       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
409:       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
410:       hipsparseFillMode_t fillMode = HIPSPARSE_FILL_MODE_UPPER;
411:       hipsparseDiagType_t diagType = HIPSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal

413:       PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
414:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
415:       PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

417:       // Allocate work vectors in SpSv
418:       PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(*fs->X) * m));
419:       PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

421:       PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
422:       PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

424:       // Query buffer sizes for SpSV and then allocate buffers
425:       PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
426:       PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
427:       PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

429:       PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
430:       PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
431:       PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

433:       // Record for reuse
434:       fs->csrVal_h = Ma;
435:       fs->diag_h   = D;
436:       PetscCall(PetscFree(Mj));
437:     }
438:     // Copy the value
439:     Ma  = fs->csrVal_h;
440:     D   = fs->diag_h;
441:     Mnz = Ai[m];
442:     for (PetscInt i = 0; i < m; i++) {
443:       D[i]      = Aa[adiag[i]];   // actually Aa[adiag[i]] is the inverse of the diagonal
444:       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
445:       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
446:     }
447:     PetscCallHIP(hipMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, hipMemcpyHostToDevice));
448:     PetscCallHIP(hipMemcpy(fs->diag, D, sizeof(*D) * m, hipMemcpyHostToDevice));

450:     {
451:       // Do hipsparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
452:       PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
453:       PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
454:       fs->updatedSpSVAnalysis = PETSC_TRUE;
455:     }
456:   }
457:   PetscFunctionReturn(PETSC_SUCCESS);
458: }

460: // Solve Ut D U x = b
461: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_Cholesky(Mat A, Vec b, Vec x)
462: {
463:   Mat_SeqAIJHIPSPARSETriFactors        *fs  = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
464:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
465:   const PetscScalar                    *barray;
466:   PetscScalar                          *xarray;
467:   thrust::device_ptr<const PetscScalar> bGPU;
468:   thrust::device_ptr<PetscScalar>       xGPU;
469:   const hipsparseSpSVAlg_t              alg = HIPSPARSE_SPSV_ALG_DEFAULT;
470:   PetscInt                              m   = A->rmap->n;

472:   PetscFunctionBegin;
473:   PetscCall(PetscLogGpuTimeBegin());
474:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
475:   PetscCall(VecHIPGetArrayRead(b, &barray));
476:   xGPU = thrust::device_pointer_cast(xarray);
477:   bGPU = thrust::device_pointer_cast(barray);

479:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
480:   if (fs->rpermIndices)
481:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));

483:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));

485:   // Solve Ut Y = X
486:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
487: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
488:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut));
489: #else
490:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
491: #endif

493:   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
494:   // It is basically a vector element-wise multiplication, but cublas does not have it!
495:   auto multiplies = thrust::multiplies<PetscScalar>();
496:   PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));

498:   // Solve U X = Y
499:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray)); // if need to permute, we need to use the intermediate buffer X

501: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
502:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U));
503: #else
504:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U, fs->spsvBuffer_U));
505: #endif

507:   // Reorder X with the column permutation if needed, and put the result back to x
508:   if (fs->cpermIndices)
509:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
510:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));

512:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
513:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
514:   PetscCall(PetscLogGpuTimeEnd());
515:   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
516:   PetscFunctionReturn(PETSC_SUCCESS);
517: }

519: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
520: {
521:   Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
522:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
523:   IS                             ip                  = a->row;
524:   PetscBool                      perm_identity;
525:   PetscInt                       n = A->rmap->n;

527:   PetscFunctionBegin;
528:   PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");

530:   PetscCall(MatSeqAIJHIPSPARSEBuildFactoredMatrix_Cholesky(A));
531:   hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

533:   A->offloadmask = PETSC_OFFLOAD_BOTH;

535:   /* lower triangular indices */
536:   PetscCall(ISIdentity(ip, &perm_identity));
537:   if (!perm_identity) {
538:     IS              iip;
539:     const PetscInt *irip, *rip;

541:     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
542:     PetscCall(ISGetIndices(iip, &irip));
543:     PetscCall(ISGetIndices(ip, &rip));
544:     hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
545:     hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
546:     hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
547:     hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
548:     PetscCall(ISRestoreIndices(iip, &irip));
549:     PetscCall(ISDestroy(&iip));
550:     PetscCall(ISRestoreIndices(ip, &rip));
551:     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
552:   }
553:   PetscFunctionReturn(PETSC_SUCCESS);
554: }

556: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
557: {
558:   PetscFunctionBegin;
559:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
560:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
561:   B->offloadmask            = PETSC_OFFLOAD_CPU;
562:   B->ops->solve             = MatSolve_SeqAIJHIPSPARSE_Cholesky;
563:   B->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_Cholesky; // since symmetric
564:   B->ops->matsolve          = NULL;
565:   B->ops->matsolvetranspose = NULL;
566:   /* get the triangular factors */
567:   PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
568:   PetscFunctionReturn(PETSC_SUCCESS);
569: }

571: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
572: {
573:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
574:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
575:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data;
576:   hipsparseIndexBase_t           indexBase;

578:   PetscFunctionBegin;
579:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
580:   matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
581:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
582:   matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
583:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
584:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
585:   PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
586:   PetscCall(PetscLogGpuTimeBegin());
587:   if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
588:   if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
589:     matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
590:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
591:     indexBase = hipsparseGetMatIndexBase(matstruct->descr);
592:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
593:     PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

595:     /* set alpha and beta */
596:     PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
597:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
598:     PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
599:     PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
600:     PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
601:     PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

603:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
604:       CsrMatrix *matrixT      = new CsrMatrix;
605:       matstructT->mat         = matrixT;
606:       matrixT->num_rows       = A->cmap->n;
607:       matrixT->num_cols       = A->rmap->n;
608:       matrixT->num_entries    = a->nz;
609:       matrixT->row_offsets    = new THRUSTINTARRAY(matrixT->num_rows + 1);
610:       matrixT->column_indices = new THRUSTINTARRAY(a->nz);
611:       matrixT->values         = new THRUSTARRAY(a->nz);

613:       if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
614:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
615:       PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), csrRowOffsetsType, csrColIndType, indexBase, hipsparse_scalartype));
616:     } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
617:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_HIPSPARSE_ELL and MAT_HIPSPARSE_HYB are not supported");
618:     }
619:   }
620:   if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
621:     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
622:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
623:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
624:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
625:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
626:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
627:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
628:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
629:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
630:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
631:     if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
632:       hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
633:       hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
634:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
635:     }
636:     if (!hipsparsestruct->csr2csc_i) { // not using hipsparseCsr2cscEx2() because it requires 32-bit indices
637:       THRUSTINTARRAY row_indices(matrix->num_entries);

639:       // Transpose the matrix via COO, i.e., by putting the row indices in column_indices[] and the column indices in row_indices[]
640:       hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); // will store the matrix to matrixT permutation, i.e., entry matrixT[i] is matrix[csr2csc_i[i]]
641:       PetscCallThrust(thrust::sequence(thrust::device, hipsparsestruct->csr2csc_i->begin(), hipsparsestruct->csr2csc_i->end()));
642:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->rmap->n), Csr2coo(hipsparsestruct->rowoffsets_gpu->data().get(), matrixT->column_indices->data().get())));
643:       row_indices = *matrix->column_indices;
644:       // Sort the COO by row then column, and get the permutation csr2csc_i[]
645:       PetscCallThrust(thrust::sort_by_key(thrust::device, thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), matrixT->column_indices->begin())), thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(), matrixT->column_indices->end())),
646:                                           hipsparsestruct->csr2csc_i->begin()));
647:       // Finalize matrixT's row_offsets by looking up row_indices[]
648:       PetscCallThrust(thrust::lower_bound(thrust::device, row_indices.begin(), row_indices.end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->cmap->n + 1), matrixT->row_offsets->begin()));
649:     }
650:     PetscCallThrust(thrust::gather(thrust::device, hipsparsestruct->csr2csc_i->begin(), hipsparsestruct->csr2csc_i->end(), matrix->values->begin(), matrixT->values->begin()));
651:   }
652:   PetscCall(PetscLogGpuTimeEnd());
653:   PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
654:   /* the compressed row indices is not used for matTranspose */
655:   matstructT->cprowIndices = NULL;
656:   /* assign the pointer */
657:   ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
658:   A->transupdated                                 = PETSC_TRUE;
659:   PetscFunctionReturn(PETSC_SUCCESS);
660: }

662: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_LU(Mat A, Vec b, Vec x)
663: {
664:   const PetscScalar                    *barray;
665:   PetscScalar                          *xarray;
666:   thrust::device_ptr<const PetscScalar> bGPU;
667:   thrust::device_ptr<PetscScalar>       xGPU;
668:   Mat_SeqAIJHIPSPARSETriFactors        *fs  = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
669:   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
670:   const hipsparseOperation_t            op  = HIPSPARSE_OPERATION_NON_TRANSPOSE;
671:   const hipsparseSpSVAlg_t              alg = HIPSPARSE_SPSV_ALG_DEFAULT;
672:   PetscInt                              m   = A->rmap->n;

674:   PetscFunctionBegin;
675:   PetscCall(PetscLogGpuTimeBegin());
676:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
677:   PetscCall(VecHIPGetArrayRead(b, &barray));
678:   xGPU = thrust::device_pointer_cast(xarray);
679:   bGPU = thrust::device_pointer_cast(barray);

681:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
682:   if (fs->rpermIndices)
683:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));

685:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));

687:   // Solve L Y = X
688:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
689:   // Note that hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!

691: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
692:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_L));
693: #else
694:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_L, fs->spsvBuffer_L));
695: #endif

697:   // Solve U X = Y
698:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray));

700: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
701:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U));
702: #else
703:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U, fs->spsvBuffer_U));
704: #endif

706:   // Reorder X with the column permutation if needed, and put the result back to x
707:   if (fs->cpermIndices)
708:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
709:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
710:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
711:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
712:   PetscCall(PetscLogGpuTimeEnd());
713:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
714:   PetscFunctionReturn(PETSC_SUCCESS);
715: }

717: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_LU(Mat A, Vec b, Vec x)
718: {
719:   Mat_SeqAIJHIPSPARSETriFactors        *fs  = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
720:   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
721:   const PetscScalar                    *barray;
722:   PetscScalar                          *xarray;
723:   thrust::device_ptr<const PetscScalar> bGPU;
724:   thrust::device_ptr<PetscScalar>       xGPU;
725:   const hipsparseOperation_t            opA = HIPSPARSE_OPERATION_TRANSPOSE;
726:   const hipsparseSpSVAlg_t              alg = HIPSPARSE_SPSV_ALG_DEFAULT;
727:   PetscInt                              m   = A->rmap->n;

729:   PetscFunctionBegin;
730:   PetscCall(PetscLogGpuTimeBegin());
731:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
732:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
733:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
734:                                                 fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

736:     PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
737:     PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
738:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
739:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
740:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
741:   }

743:   if (!fs->updatedTransposeSpSVAnalysis) {
744:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

746:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
747:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
748:   }

750:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
751:   PetscCall(VecHIPGetArrayRead(b, &barray));
752:   xGPU = thrust::device_pointer_cast(xarray);
753:   bGPU = thrust::device_pointer_cast(barray);

755:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
756:   if (fs->rpermIndices)
757:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));

759:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));

761:   // Solve Ut Y = X
762:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
763: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
764:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut));
765: #else
766:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
767: #endif

769:   // Solve Lt X = Y
770:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray)); // if need to permute, we need to use the intermediate buffer X

772: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
773:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_Lt));
774: #else
775:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
776: #endif

778:   // Reorder X with the column permutation if needed, and put the result back to x
779:   if (fs->cpermIndices)
780:     PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
781:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));

783:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
784:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
785:   PetscCall(PetscLogGpuTimeEnd());
786:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
787:   PetscFunctionReturn(PETSC_SUCCESS);
788: }

790: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
791: {
792:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
793:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
794:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
795:   CsrMatrix                     *Acsr;
796:   PetscInt                       m, nz;
797:   PetscBool                      flg;

799:   PetscFunctionBegin;
800:   if (PetscDefined(USE_DEBUG)) {
801:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
802:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
803:   }

805:   /* Copy A's value to fact */
806:   m  = fact->rmap->n;
807:   nz = aij->nz;
808:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
809:   Acsr = (CsrMatrix *)Acusp->mat->mat;
810:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

812:   PetscCall(PetscLogGpuTimeBegin());
813:   /* Factorize fact inplace */
814:   if (m)
815:     PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
816:                                           fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
817:   if (PetscDefined(USE_DEBUG)) {
818:     int               numerical_zero;
819:     hipsparseStatus_t status;
820:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
821:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
822:   }

824:   {
825:     /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02()
826:      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
827:     */
828:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

830:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

832:     fs->updatedSpSVAnalysis = PETSC_TRUE;
833:     /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
834:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
835:   }

837:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
838:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but hipsparseSpSV_solve() supports both 32 and 64. The info is encoded in hipsparseSpMatDescr_t.
839:   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJHIPSPARSE_LU;
840:   fact->ops->matsolve          = NULL;
841:   fact->ops->matsolvetranspose = NULL;
842:   PetscCall(PetscLogGpuTimeEnd());
843:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
844:   PetscFunctionReturn(PETSC_SUCCESS);
845: }

847: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
848: {
849:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
850:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
851:   PetscInt                       m, nz;

853:   PetscFunctionBegin;
854:   if (PetscDefined(USE_DEBUG)) {
855:     PetscBool flg, diagDense;

857:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
858:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
859:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
860:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
861:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
862:   }

864:   /* Free the old stale stuff */
865:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

867:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
868:      but they will not be used. Allocate them just for easy debugging.
869:    */
870:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

872:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
873:   fact->factortype             = MAT_FACTOR_ILU;
874:   fact->info.factor_mallocs    = 0;
875:   fact->info.fill_ratio_given  = info->fill;
876:   fact->info.fill_ratio_needed = 1.0;

878:   aij->row = NULL;
879:   aij->col = NULL;

881:   /* ====================================================================== */
882:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
883:   /* We'll do in-place factorization on fact                                */
884:   /* ====================================================================== */
885:   const PetscInt *Ai, *Aj;

887:   m  = fact->rmap->n;
888:   nz = aij->nz;

890:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
891:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
892:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
893:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed

895:   PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
896:   PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
897:   PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));

899:   /* ====================================================================== */
900:   /* Create descriptors for M, L, U                                         */
901:   /* ====================================================================== */
902:   hipsparseFillMode_t fillMode;
903:   hipsparseDiagType_t diagType;

905:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
906:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
907:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

909:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
910:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
911:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
912:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
913:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
914:   */
915:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
916:   diagType = HIPSPARSE_DIAG_TYPE_UNIT;
917:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
918:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
919:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

921:   fillMode = HIPSPARSE_FILL_MODE_UPPER;
922:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
923:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
924:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
925:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

927:   /* ========================================================================= */
928:   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
929:   /* ========================================================================= */
930:   PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
931:   if (m)
932:     PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
933:                                                      fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

935:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
936:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

938:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
939:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

941:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
942:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

944:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
945:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

947:   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
948:      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
949:      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
950:      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
951:    */
952:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
953:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
954:     fs->spsvBuffer_L = fs->factBuffer_M;
955:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
956:   } else {
957:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
958:     fs->spsvBuffer_U = fs->factBuffer_M;
959:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
960:   }

962:   /* ========================================================================== */
963:   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
964:   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
965:   /* ========================================================================== */
966:   int               structural_zero;
967:   hipsparseStatus_t status;

969:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
970:   if (m)
971:     PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
972:                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
973:   if (PetscDefined(USE_DEBUG)) {
974:     /* hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
975:     status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
976:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
977:   }

979:   /* Estimate FLOPs of the numeric factorization */
980:   {
981:     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
982:     PetscInt       *Ai, nzRow, nzLeft;
983:     const PetscInt *adiag;
984:     PetscLogDouble  flops = 0.0;

986:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
987:     Ai = Aseq->i;
988:     for (PetscInt i = 0; i < m; i++) {
989:       if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
990:         nzRow  = Ai[i + 1] - Ai[i];
991:         nzLeft = adiag[i] - Ai[i];
992:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
993:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
994:         */
995:         nzLeft = (nzRow - 1) / 2;
996:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
997:       }
998:     }
999:     fs->numericFactFlops = flops;
1000:   }
1001:   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1002:   PetscFunctionReturn(PETSC_SUCCESS);
1003: }

1005: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1006: {
1007:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1008:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1009:   const PetscScalar             *barray;
1010:   PetscScalar                   *xarray;

1012:   PetscFunctionBegin;
1013:   PetscCall(VecHIPGetArrayWrite(x, &xarray));
1014:   PetscCall(VecHIPGetArrayRead(b, &barray));
1015:   PetscCall(PetscLogGpuTimeBegin());

1017:   /* Solve L*y = b */
1018:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1019:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1020: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1021:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1022:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1023: #else
1024:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1025:                                          fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1026: #endif
1027:   /* Solve Lt*x = y */
1028:   PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1029: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1030:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1031:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1032: #else
1033:   PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1034:                                          fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1035: #endif
1036:   PetscCall(VecHIPRestoreArrayRead(b, &barray));
1037:   PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

1039:   PetscCall(PetscLogGpuTimeEnd());
1040:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1041:   PetscFunctionReturn(PETSC_SUCCESS);
1042: }

1044: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1045: {
1046:   Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1047:   Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
1048:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1049:   CsrMatrix                     *Acsr;
1050:   PetscInt                       m, nz;
1051:   PetscBool                      flg;

1053:   PetscFunctionBegin;
1054:   if (PetscDefined(USE_DEBUG)) {
1055:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1056:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1057:   }

1059:   /* Copy A's value to fact */
1060:   m  = fact->rmap->n;
1061:   nz = aij->nz;
1062:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1063:   Acsr = (CsrMatrix *)Acusp->mat->mat;
1064:   PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

1066:   /* Factorize fact inplace */
1067:   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1068:      csric02() only takes the lower triangular part of matrix A to perform factorization.
1069:      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1070:      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1071:      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1072:    */
1073:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1074:   if (PetscDefined(USE_DEBUG)) {
1075:     int               numerical_zero;
1076:     hipsparseStatus_t status;
1077:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1078:     PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1079:   }

1081:   {
1082:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

1084:     /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1085:     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1086:   */
1087:     PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1088:     fs->updatedSpSVAnalysis = PETSC_TRUE;
1089:   }

1091:   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1092:   fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
1093:   fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0;
1094:   fact->ops->matsolve          = NULL;
1095:   fact->ops->matsolvetranspose = NULL;
1096:   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1097:   PetscFunctionReturn(PETSC_SUCCESS);
1098: }

1100: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1101: {
1102:   Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1103:   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
1104:   PetscInt                       m, nz;

1106:   PetscFunctionBegin;
1107:   if (PetscDefined(USE_DEBUG)) {
1108:     PetscBool flg, diagDense;

1110:     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1111:     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1112:     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1113:     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1114:     PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1115:   }

1117:   /* Free the old stale stuff */
1118:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

1120:   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1121:      but they will not be used. Allocate them just for easy debugging.
1122:    */
1123:   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

1125:   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1126:   fact->factortype             = MAT_FACTOR_ICC;
1127:   fact->info.factor_mallocs    = 0;
1128:   fact->info.fill_ratio_given  = info->fill;
1129:   fact->info.fill_ratio_needed = 1.0;

1131:   aij->row = NULL;
1132:   aij->col = NULL;

1134:   /* ====================================================================== */
1135:   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1136:   /* We'll do in-place factorization on fact                                */
1137:   /* ====================================================================== */
1138:   const PetscInt *Ai, *Aj;

1140:   m  = fact->rmap->n;
1141:   nz = aij->nz;

1143:   PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1144:   PetscCallHIP(hipMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1145:   PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1146:   PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed

1148:   PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
1149:   PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
1150:   PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));

1152:   /* ====================================================================== */
1153:   /* Create mat descriptors for M, L                                        */
1154:   /* ====================================================================== */
1155:   hipsparseFillMode_t fillMode;
1156:   hipsparseDiagType_t diagType;

1158:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1159:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1160:   PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

1162:   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1163:     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1164:     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1165:     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1166:     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1167:   */
1168:   fillMode = HIPSPARSE_FILL_MODE_LOWER;
1169:   diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1170:   PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1171:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1172:   PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

1174:   /* ========================================================================= */
1175:   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1176:   /* ========================================================================= */
1177:   PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1178:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

1180:   PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1181:   PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

1183:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1184:   PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

1186:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1187:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

1189:   PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1190:   PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

1192:   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1193:      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1194:    */
1195:   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1196:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1197:     fs->spsvBuffer_L = fs->factBuffer_M;
1198:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1199:   } else {
1200:     PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1201:     fs->spsvBuffer_Lt = fs->factBuffer_M;
1202:     PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1203:   }

1205:   /* ========================================================================== */
1206:   /* Perform analysis of ic0 on M                                               */
1207:   /* The lower triangular part of M has the same sparsity pattern as L          */
1208:   /* ========================================================================== */
1209:   int               structural_zero;
1210:   hipsparseStatus_t status;

1212:   fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1213:   if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1214:   if (PetscDefined(USE_DEBUG)) {
1215:     /* hipsparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1216:     status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1217:     PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1218:   }

1220:   /* Estimate FLOPs of the numeric factorization */
1221:   {
1222:     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
1223:     PetscInt      *Ai, nzRow, nzLeft;
1224:     PetscLogDouble flops = 0.0;

1226:     Ai = Aseq->i;
1227:     for (PetscInt i = 0; i < m; i++) {
1228:       nzRow = Ai[i + 1] - Ai[i];
1229:       if (nzRow > 1) {
1230:         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1231:           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1232:         */
1233:         nzLeft = (nzRow - 1) / 2;
1234:         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1235:       }
1236:     }
1237:     fs->numericFactFlops = flops;
1238:   }
1239:   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1240:   PetscFunctionReturn(PETSC_SUCCESS);
1241: }

1243: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1244: {
1245:   // use_cpu_solve is a field in Mat_SeqAIJHIPSPARSE. B, a factored matrix, uses Mat_SeqAIJHIPSPARSETriFactors.
1246:   Mat_SeqAIJHIPSPARSE *hipsparsestruct = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);

1248:   PetscFunctionBegin;
1249:   PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1250:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1251:   B->offloadmask = PETSC_OFFLOAD_CPU;

1253:   if (!hipsparsestruct->use_cpu_solve) {
1254:     B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_LU;
1255:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_LU;
1256:   }
1257:   B->ops->matsolve          = NULL;
1258:   B->ops->matsolvetranspose = NULL;

1260:   /* get the triangular factors */
1261:   if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
1262:   PetscFunctionReturn(PETSC_SUCCESS);
1263: }

1265: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1266: {
1267:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1269:   PetscFunctionBegin;
1270:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1271:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1272:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1273:   PetscFunctionReturn(PETSC_SUCCESS);
1274: }

1276: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1277: {
1278:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1280:   PetscFunctionBegin;
1281:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1282:   if (!info->factoronhost) {
1283:     PetscCall(ISIdentity(isrow, &row_identity));
1284:     PetscCall(ISIdentity(iscol, &col_identity));
1285:   }
1286:   if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1287:   else {
1288:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1289:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1290:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1291:   }
1292:   PetscFunctionReturn(PETSC_SUCCESS);
1293: }

1295: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1296: {
1297:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1299:   PetscFunctionBegin;
1300:   PetscBool perm_identity = PETSC_FALSE;
1301:   if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1302:   if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1303:   else {
1304:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1305:     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1306:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1307:   }
1308:   PetscFunctionReturn(PETSC_SUCCESS);
1309: }

1311: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1312: {
1313:   Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

1315:   PetscFunctionBegin;
1316:   PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1317:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1318:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1319:   PetscFunctionReturn(PETSC_SUCCESS);
1320: }

1322: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1323: {
1324:   PetscFunctionBegin;
1325:   *type = MATSOLVERHIPSPARSE;
1326:   PetscFunctionReturn(PETSC_SUCCESS);
1327: }

1329: /*MC
1330:   MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1331:   on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
1332:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1333:   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1334:   HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1335:   algorithms are not recommended. This class does NOT support direct solver operations.

1337:   Level: beginner

1339: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1340: M*/

1342: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1343: {
1344:   PetscInt n = A->rmap->n;

1346:   PetscFunctionBegin;
1347:   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1348:   PetscCall(MatSetSizes(*B, n, n, n, n));
1349:   (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJHIPSPARSETriFactors
1350:   PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

1352:   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1353:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1354:     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1355:     if (!A->boundtocpu) {
1356:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1357:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1358:     } else {
1359:       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1360:       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1361:     }
1362:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1363:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1364:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1365:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1366:     if (!A->boundtocpu) {
1367:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1368:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1369:     } else {
1370:       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1371:       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1372:     }
1373:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1374:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1375:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

1377:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1378:   (*B)->canuseordering = PETSC_TRUE;
1379:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1380:   PetscFunctionReturn(PETSC_SUCCESS);
1381: }

1383: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1384: {
1385:   Mat_SeqAIJ                    *a    = (Mat_SeqAIJ *)A->data;
1386:   Mat_SeqAIJHIPSPARSE           *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1387:   Mat_SeqAIJHIPSPARSETriFactors *fs   = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;

1389:   PetscFunctionBegin;
1390:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1391:     PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1392:     if (A->factortype == MAT_FACTOR_NONE) {
1393:       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1394:       PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1395:     } else if (fs->csrVal) {
1396:       /* We have a factorized matrix on device and are able to copy it to host */
1397:       PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1398:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1399:     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1400:     PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1401:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1402:   }
1403:   PetscFunctionReturn(PETSC_SUCCESS);
1404: }

1406: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (HIP specialisation) */
1407: struct MatSeqAIJHIPSPARSE_Policy {
1408:   typedef Mat_SeqAIJHIPSPARSE           mat_struct_type;
1409:   typedef Mat_SeqAIJHIPSPARSEMultStruct mult_struct_type;

1411:   static int storage_format_csr() { return (int)MAT_HIPSPARSE_CSR; }
1412:   static int storage_format_ell() { return (int)MAT_HIPSPARSE_ELL; }
1413:   static int storage_format_hyb() { return (int)MAT_HIPSPARSE_HYB; }

1415:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJHIPSPARSECopyToGPU(A); }
1416:   static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJHIPSPARSECopyFromGPU(A); }
1417:   static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJHIPSPARSEInvalidateTranspose(A, d); }
1418:   static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, t, r, C); }
1419:   static const char    *mat_type_name;

1421:   static PetscErrorCode Destroy(Mat A) { return MatSeqAIJHIPSPARSE_Destroy(A); }
1422:   static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)spptr); }
1423:   static const char    *set_format_c;
1424:   static const char    *set_use_cpu_solve_c;
1425:   static const char    *product_seqdense_device_c;
1426:   static const char    *product_seqdense_c;
1427:   static const char    *product_self_c;
1428:   static const char    *seq_convert_hypre_c;

1430:   static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecHIPGetArrayRead(v, a); }
1431:   static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecHIPRestoreArrayRead(v, a); }
1432:   static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecHIPGetArrayWrite(v, a); }
1433:   static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecHIPRestoreArrayWrite(v, a); }
1434: };
1435: const char *MatSeqAIJHIPSPARSE_Policy::mat_type_name             = MATSEQAIJHIPSPARSE;
1436: const char *MatSeqAIJHIPSPARSE_Policy::set_format_c              = "MatHIPSPARSESetFormat_C";
1437: const char *MatSeqAIJHIPSPARSE_Policy::set_use_cpu_solve_c       = "MatHIPSPARSESetUseCPUSolve_C";
1438: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C";
1439: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_c        = "MatProductSetFromOptions_seqaijhipsparse_seqdense_C";
1440: const char *MatSeqAIJHIPSPARSE_Policy::product_self_c            = "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C";
1441: const char *MatSeqAIJHIPSPARSE_Policy::seq_convert_hypre_c       = "MatConvert_seqaijhipsparse_hypre_C";

1443: using MatSeqAIJHIPSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::HIP, MatSeqAIJHIPSPARSE_Policy>;

1445: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1446: {
1447:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1448: }

1450: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1451: {
1452:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1453: }

1455: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1456: {
1457:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1458: }

1460: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1461: {
1462:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1463: }

1465: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1466: {
1467:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1468: }

1470: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1471: {
1472:   return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1473: }

1475: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1476: {
1477:   Mat_SeqAIJHIPSPARSE *cusp;
1478:   CsrMatrix           *matrix;

1480:   PetscFunctionBegin;
1481:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1482:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1483:   cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1484:   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1485:   matrix = (CsrMatrix *)cusp->mat->mat;

1487:   if (i) *i = matrix->row_offsets->data().get();
1488:   if (j) *j = matrix->column_indices->data().get();
1489:   if (a) *a = matrix->values->data().get();
1490:   if (mtype) *mtype = PETSC_MEMTYPE_HIP;
1491:   PetscFunctionReturn(PETSC_SUCCESS);
1492: }

1494: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
1495: {
1496:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1497:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
1498:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
1499:   PetscInt                       m               = A->rmap->n, *ii, *ridx, tmp;
1500:   PetscBool                      both            = PETSC_TRUE;

1502:   PetscFunctionBegin;
1503:   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1504:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1505:     if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
1506:       CsrMatrix *matrix;
1507:       matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

1509:       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1510:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1511:       matrix->values->assign(a->a, a->a + a->nz);
1512:       PetscCallHIP(WaitForHIP());
1513:       PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1514:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1515:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
1516:     } else {
1517:       PetscInt nnz;
1518:       PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1519:       PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
1520:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
1521:       delete hipsparsestruct->workVector;
1522:       delete hipsparsestruct->rowoffsets_gpu;
1523:       hipsparsestruct->workVector     = NULL;
1524:       hipsparsestruct->rowoffsets_gpu = NULL;
1525:       try {
1526:         if (a->compressedrow.use) {
1527:           m    = a->compressedrow.nrows;
1528:           ii   = a->compressedrow.i;
1529:           ridx = a->compressedrow.rindex;
1530:         } else {
1531:           m    = A->rmap->n;
1532:           ii   = a->i;
1533:           ridx = NULL;
1534:         }
1535:         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1536:         if (!a->a) {
1537:           nnz  = ii[m];
1538:           both = PETSC_FALSE;
1539:         } else nnz = a->nz;
1540:         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

1542:         /* create cusparse matrix */
1543:         hipsparsestruct->nrows = m;
1544:         matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
1545:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
1546:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
1547:         PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

1549:         PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1550:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1551:         PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1552:         PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
1553:         PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
1554:         PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
1555:         PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

1557:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1558:         if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
1559:           /* set the matrix */
1560:           CsrMatrix *mat   = new CsrMatrix;
1561:           mat->num_rows    = m;
1562:           mat->num_cols    = A->cmap->n;
1563:           mat->num_entries = nnz;
1564:           PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY(m + 1));
1565:           mat->row_offsets->assign(ii, ii + m + 1);
1566:           PetscCallCXX(mat->column_indices = new THRUSTINTARRAY(nnz));
1567:           mat->column_indices->assign(a->j, a->j + nnz);

1569:           PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1570:           if (a->a) mat->values->assign(a->a, a->a + nnz);

1572:           /* assign the pointer */
1573:           matstruct->mat = mat;
1574:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1575:             PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1576:           }
1577:         } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
1578:           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_HIPSPARSE_ELL and MAT_HIPSPARSE_HYB are not supported");
1579:         }

1581:         /* assign the compressed row indices */
1582:         if (a->compressedrow.use) {
1583:           PetscCallCXX(hipsparsestruct->workVector = new THRUSTARRAY(m));
1584:           PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1585:           matstruct->cprowIndices->assign(ridx, ridx + m);
1586:           tmp = m;
1587:         } else {
1588:           hipsparsestruct->workVector = NULL;
1589:           matstruct->cprowIndices     = NULL;
1590:           tmp                         = 0;
1591:         }
1592:         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

1594:         /* assign the pointer */
1595:         hipsparsestruct->mat = matstruct;
1596:       } catch (char *ex) {
1597:         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
1598:       }
1599:       PetscCallHIP(WaitForHIP());
1600:       PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1601:       hipsparsestruct->nonzerostate = A->nonzerostate;
1602:     }
1603:     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1604:   }
1605:   PetscFunctionReturn(PETSC_SUCCESS);
1606: }

1608: struct VecHIPPlusEquals {
1609:   template <typename Tuple>
1610:   __host__ __device__ void operator()(Tuple t)
1611:   {
1612:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1613:   }
1614: };

1616: struct VecHIPEquals {
1617:   template <typename Tuple>
1618:   __host__ __device__ void operator()(Tuple t)
1619:   {
1620:     thrust::get<1>(t) = thrust::get<0>(t);
1621:   }
1622: };

1624: struct VecHIPEqualsReverse {
1625:   template <typename Tuple>
1626:   __host__ __device__ void operator()(Tuple t)
1627:   {
1628:     thrust::get<0>(t) = thrust::get<1>(t);
1629:   }
1630: };

1632: struct MatProductCtx_MatMatHipsparse {
1633:   PetscBool              cisdense;
1634:   PetscScalar           *Bt;
1635:   Mat                    X;
1636:   PetscBool              reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
1637:   PetscLogDouble         flops;
1638:   CsrMatrix             *Bcsr;
1639:   hipsparseSpMatDescr_t  matSpBDescr;
1640:   PetscBool              initialized; /* C = alpha op(A) op(B) + beta C */
1641:   hipsparseDnMatDescr_t  matBDescr;
1642:   hipsparseDnMatDescr_t  matCDescr;
1643:   PetscInt               Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1644:   size_t                 mmBufferSize;
1645:   void                  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1646:   hipsparseSpGEMMDescr_t spgemmDesc;
1647: };

1649: static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
1650: {
1651:   MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;

1653:   PetscFunctionBegin;
1654:   PetscCallHIP(hipFree(mmdata->Bt));
1655:   delete mmdata->Bcsr;
1656:   if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
1657:   if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
1658:   if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
1659:   if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1660:   PetscCallHIP(hipFree(mmdata->mmBuffer));
1661:   PetscCallHIP(hipFree(mmdata->mmBuffer2));
1662:   PetscCall(MatDestroy(&mmdata->X));
1663:   PetscCall(PetscFree(*(void **)data));
1664:   PetscFunctionReturn(PETSC_SUCCESS);
1665: }

1667: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
1668: {
1669:   Mat_Product                   *product = C->product;
1670:   Mat                            A, B;
1671:   PetscInt                       m, n, blda, clda;
1672:   PetscBool                      flg, biship;
1673:   Mat_SeqAIJHIPSPARSE           *cusp;
1674:   hipsparseOperation_t           opA;
1675:   const PetscScalar             *barray;
1676:   PetscScalar                   *carray;
1677:   MatProductCtx_MatMatHipsparse *mmdata;
1678:   Mat_SeqAIJHIPSPARSEMultStruct *mat;
1679:   CsrMatrix                     *csrmat;

1681:   PetscFunctionBegin;
1682:   MatCheckProduct(C, 1);
1683:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1684:   mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
1685:   A      = product->A;
1686:   B      = product->B;
1687:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1688:   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1689:   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1690:      Instead of silently accepting the wrong answer, I prefer to raise the error */
1691:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1692:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1693:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1694:   switch (product->type) {
1695:   case MATPRODUCT_AB:
1696:   case MATPRODUCT_PtAP:
1697:     mat = cusp->mat;
1698:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1699:     m   = A->rmap->n;
1700:     n   = B->cmap->n;
1701:     break;
1702:   case MATPRODUCT_AtB:
1703:     if (!A->form_explicit_transpose) {
1704:       mat = cusp->mat;
1705:       opA = HIPSPARSE_OPERATION_TRANSPOSE;
1706:     } else {
1707:       PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
1708:       mat = cusp->matTranspose;
1709:       opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1710:     }
1711:     m = A->cmap->n;
1712:     n = B->cmap->n;
1713:     break;
1714:   case MATPRODUCT_ABt:
1715:   case MATPRODUCT_RARt:
1716:     mat = cusp->mat;
1717:     opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1718:     m   = A->rmap->n;
1719:     n   = B->rmap->n;
1720:     break;
1721:   default:
1722:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1723:   }
1724:   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
1725:   csrmat = (CsrMatrix *)mat->mat;
1726:   /* if the user passed a CPU matrix, copy the data to the GPU */
1727:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
1728:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
1729:   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
1730:   PetscCall(MatDenseGetLDA(B, &blda));
1731:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1732:     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1733:     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1734:   } else {
1735:     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1736:     PetscCall(MatDenseGetLDA(C, &clda));
1737:   }

1739:   PetscCall(PetscLogGpuTimeBegin());
1740:   hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
1741:   /* (re)allocate mmBuffer if not initialized or LDAs are different */
1742:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1743:     size_t mmBufferSize;
1744:     if (mmdata->initialized && mmdata->Blda != blda) {
1745:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
1746:       mmdata->matBDescr = NULL;
1747:     }
1748:     if (!mmdata->matBDescr) {
1749:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
1750:       mmdata->Blda = blda;
1751:     }
1752:     if (mmdata->initialized && mmdata->Clda != clda) {
1753:       PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
1754:       mmdata->matCDescr = NULL;
1755:     }
1756:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1757:       PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
1758:       mmdata->Clda = clda;
1759:     }
1760:     if (!mat->matDescr) {
1761:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1762:     }
1763:     PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
1764:     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1765:       PetscCallHIP(hipFree(mmdata->mmBuffer));
1766:       PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
1767:       mmdata->mmBufferSize = mmBufferSize;
1768:     }
1769:     mmdata->initialized = PETSC_TRUE;
1770:   } else {
1771:     /* to be safe, always update pointers of the mats */
1772:     PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
1773:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1774:     PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1775:   }

1777:   /* do hipsparseSpMM, which supports transpose on B */
1778:   PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

1780:   PetscCall(PetscLogGpuTimeEnd());
1781:   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1782:   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1783:   if (product->type == MATPRODUCT_RARt) {
1784:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1785:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1786:   } else if (product->type == MATPRODUCT_PtAP) {
1787:     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1788:     PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1789:   } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1790:   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1791:   if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1792:   PetscFunctionReturn(PETSC_SUCCESS);
1793: }

1795: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
1796: {
1797:   Mat_Product                   *product = C->product;
1798:   Mat                            A, B;
1799:   PetscInt                       m, n;
1800:   PetscBool                      cisdense, flg;
1801:   MatProductCtx_MatMatHipsparse *mmdata;
1802:   Mat_SeqAIJHIPSPARSE           *cusp;

1804:   PetscFunctionBegin;
1805:   MatCheckProduct(C, 1);
1806:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1807:   A = product->A;
1808:   B = product->B;
1809:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1810:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1811:   cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1812:   PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1813:   switch (product->type) {
1814:   case MATPRODUCT_AB:
1815:     m = A->rmap->n;
1816:     n = B->cmap->n;
1817:     break;
1818:   case MATPRODUCT_AtB:
1819:     m = A->cmap->n;
1820:     n = B->cmap->n;
1821:     break;
1822:   case MATPRODUCT_ABt:
1823:     m = A->rmap->n;
1824:     n = B->rmap->n;
1825:     break;
1826:   case MATPRODUCT_PtAP:
1827:     m = B->cmap->n;
1828:     n = B->cmap->n;
1829:     break;
1830:   case MATPRODUCT_RARt:
1831:     m = B->rmap->n;
1832:     n = B->rmap->n;
1833:     break;
1834:   default:
1835:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1836:   }
1837:   PetscCall(MatSetSizes(C, m, n, m, n));
1838:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1839:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1840:   PetscCall(MatSetType(C, MATSEQDENSEHIP));

1842:   /* product data */
1843:   PetscCall(PetscNew(&mmdata));
1844:   mmdata->cisdense = cisdense;
1845:   /* for these products we need intermediate storage */
1846:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1847:     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1848:     PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
1849:     /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
1850:     if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1851:     else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1852:   }
1853:   C->product->data       = mmdata;
1854:   C->product->destroy    = MatProductCtxDestroy_MatMatHipsparse;
1855:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
1856:   PetscFunctionReturn(PETSC_SUCCESS);
1857: }

1859: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
1860: {
1861:   Mat_Product                   *product = C->product;
1862:   Mat                            A, B;
1863:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
1864:   Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
1865:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
1866:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
1867:   PetscBool                      flg;
1868:   MatProductType                 ptype;
1869:   MatProductCtx_MatMatHipsparse *mmdata;
1870:   hipsparseSpMatDescr_t          BmatSpDescr;
1871:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

1873:   PetscFunctionBegin;
1874:   MatCheckProduct(C, 1);
1875:   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1876:   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
1877:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1878:   mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
1879:   A      = product->A;
1880:   B      = product->B;
1881:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1882:     mmdata->reusesym = PETSC_FALSE;
1883:     Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
1884:     PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1885:     Cmat = Ccusp->mat;
1886:     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1887:     Ccsr = (CsrMatrix *)Cmat->mat;
1888:     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1889:     goto finalize;
1890:   }
1891:   if (!c->nz) goto finalize;
1892:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1893:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1894:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
1895:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1896:   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1897:   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1898:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1899:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
1900:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
1901:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1902:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1903:   PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1904:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1905:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

1907:   ptype = product->type;
1908:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1909:     ptype = MATPRODUCT_AB;
1910:     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1911:   }
1912:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1913:     ptype = MATPRODUCT_AB;
1914:     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1915:   }
1916:   switch (ptype) {
1917:   case MATPRODUCT_AB:
1918:     Amat = Acusp->mat;
1919:     Bmat = Bcusp->mat;
1920:     break;
1921:   case MATPRODUCT_AtB:
1922:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
1923:     Amat = Acusp->matTranspose;
1924:     Bmat = Bcusp->mat;
1925:     break;
1926:   case MATPRODUCT_ABt:
1927:     Amat = Acusp->mat;
1928:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
1929:     Bmat = Bcusp->matTranspose;
1930:     break;
1931:   default:
1932:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1933:   }
1934:   Cmat = Ccusp->mat;
1935:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1936:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1937:   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1938:   Acsr = (CsrMatrix *)Amat->mat;
1939:   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1940:   Ccsr = (CsrMatrix *)Cmat->mat;
1941:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
1942:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
1943:   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1944:   PetscCall(PetscLogGpuTimeBegin());
1945:   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
1946:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
1947:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
1948:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
1949:   PetscCall(PetscLogGpuFlops(mmdata->flops));
1950:   PetscCallHIP(WaitForHIP());
1951:   PetscCall(PetscLogGpuTimeEnd());
1952:   C->offloadmask = PETSC_OFFLOAD_GPU;
1953: finalize:
1954:   /* shorter version of MatAssemblyEnd_SeqAIJ */
1955:   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
1956:   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
1957:   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
1958:   c->reallocs = 0;
1959:   C->info.mallocs += 0;
1960:   C->info.nz_unneeded = 0;
1961:   C->assembled = C->was_assembled = PETSC_TRUE;
1962:   C->num_ass++;
1963:   PetscFunctionReturn(PETSC_SUCCESS);
1964: }

1966: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
1967: {
1968:   Mat_Product                   *product = C->product;
1969:   Mat                            A, B;
1970:   Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
1971:   Mat_SeqAIJ                    *a, *b, *c;
1972:   Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
1973:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
1974:   PetscInt                       i, j, m, n, k;
1975:   PetscBool                      flg;
1976:   MatProductType                 ptype;
1977:   MatProductCtx_MatMatHipsparse *mmdata;
1978:   PetscLogDouble                 flops;
1979:   PetscBool                      biscompressed, ciscompressed;
1980:   int64_t                        C_num_rows1, C_num_cols1, C_nnz1;
1981:   hipsparseSpMatDescr_t          BmatSpDescr;
1982:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* HIPSPARSE spgemm doesn't support transpose yet */
1983:   size_t                         bufSize2;

1985:   PetscFunctionBegin;
1986:   MatCheckProduct(C, 1);
1987:   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1988:   A = product->A;
1989:   B = product->B;
1990:   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1991:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1992:   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
1993:   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1994:   a = (Mat_SeqAIJ *)A->data;
1995:   b = (Mat_SeqAIJ *)B->data;
1996:   /* product data */
1997:   PetscCall(PetscNew(&mmdata));
1998:   C->product->data    = mmdata;
1999:   C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;

2001:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2002:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2003:   Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2004:   Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2005:   PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2006:   PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

2008:   ptype = product->type;
2009:   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2010:     ptype                                          = MATPRODUCT_AB;
2011:     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2012:   }
2013:   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2014:     ptype                                          = MATPRODUCT_AB;
2015:     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2016:   }
2017:   biscompressed = PETSC_FALSE;
2018:   ciscompressed = PETSC_FALSE;
2019:   switch (ptype) {
2020:   case MATPRODUCT_AB:
2021:     m    = A->rmap->n;
2022:     n    = B->cmap->n;
2023:     k    = A->cmap->n;
2024:     Amat = Acusp->mat;
2025:     Bmat = Bcusp->mat;
2026:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2027:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2028:     break;
2029:   case MATPRODUCT_AtB:
2030:     m = A->cmap->n;
2031:     n = B->cmap->n;
2032:     k = A->rmap->n;
2033:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2034:     Amat = Acusp->matTranspose;
2035:     Bmat = Bcusp->mat;
2036:     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2037:     break;
2038:   case MATPRODUCT_ABt:
2039:     m = A->rmap->n;
2040:     n = B->rmap->n;
2041:     k = A->cmap->n;
2042:     PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2043:     Amat = Acusp->mat;
2044:     Bmat = Bcusp->matTranspose;
2045:     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2046:     break;
2047:   default:
2048:     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2049:   }

2051:   /* create hipsparse matrix */
2052:   PetscCall(MatSetSizes(C, m, n, m, n));
2053:   PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2054:   c     = (Mat_SeqAIJ *)C->data;
2055:   Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2056:   Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
2057:   Ccsr  = new CsrMatrix;

2059:   c->compressedrow.use = ciscompressed;
2060:   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2061:     c->compressedrow.nrows = a->compressedrow.nrows;
2062:     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2063:     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2064:     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2065:     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2066:     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2067:   } else {
2068:     c->compressedrow.nrows  = 0;
2069:     c->compressedrow.i      = NULL;
2070:     c->compressedrow.rindex = NULL;
2071:     Ccusp->workVector       = NULL;
2072:     Cmat->cprowIndices      = NULL;
2073:   }
2074:   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2075:   Ccusp->mat        = Cmat;
2076:   Ccusp->mat->mat   = Ccsr;
2077:   Ccsr->num_rows    = Ccusp->nrows;
2078:   Ccsr->num_cols    = n;
2079:   Ccsr->row_offsets = new THRUSTINTARRAY(Ccusp->nrows + 1);
2080:   PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2081:   PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2082:   PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2083:   PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2084:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2085:   PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2086:   PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2087:   PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2088:   PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2089:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2090:     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2091:     c->nz                = 0;
2092:     Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2093:     Ccsr->values         = new THRUSTARRAY(c->nz);
2094:     goto finalizesym;
2095:   }

2097:   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2098:   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2099:   Acsr = (CsrMatrix *)Amat->mat;
2100:   if (!biscompressed) {
2101:     Bcsr        = (CsrMatrix *)Bmat->mat;
2102:     BmatSpDescr = Bmat->matDescr;
2103:   } else { /* we need to use row offsets for the full matrix */
2104:     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2105:     Bcsr                 = new CsrMatrix;
2106:     Bcsr->num_rows       = B->rmap->n;
2107:     Bcsr->num_cols       = cBcsr->num_cols;
2108:     Bcsr->num_entries    = cBcsr->num_entries;
2109:     Bcsr->column_indices = cBcsr->column_indices;
2110:     Bcsr->values         = cBcsr->values;
2111:     if (!Bcusp->rowoffsets_gpu) {
2112:       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
2113:       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2114:       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2115:     }
2116:     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2117:     mmdata->Bcsr      = Bcsr;
2118:     if (Bcsr->num_rows && Bcsr->num_cols) {
2119:       PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2120:     }
2121:     BmatSpDescr = mmdata->matSpBDescr;
2122:   }
2123:   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2124:   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2125:   /* precompute flops count */
2126:   if (ptype == MATPRODUCT_AB) {
2127:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2128:       const PetscInt st = a->i[i];
2129:       const PetscInt en = a->i[i + 1];
2130:       for (j = st; j < en; j++) {
2131:         const PetscInt brow = a->j[j];
2132:         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2133:       }
2134:     }
2135:   } else if (ptype == MATPRODUCT_AtB) {
2136:     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2137:       const PetscInt anzi = a->i[i + 1] - a->i[i];
2138:       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2139:       flops += (2. * anzi) * bnzi;
2140:     }
2141:   } else flops = 0.; /* TODO */

2143:   mmdata->flops = flops;
2144:   PetscCall(PetscLogGpuTimeBegin());

2146:   PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2147:   // cuda-12.2 requires non-null csrRowOffsets
2148:   PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2149:   PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2150:   // Note that cusparseSpGEMMreuse is deprecated in CUDA 13.2.1

2152:   /* ask bufferSize bytes for external memory */
2153:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2154:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2155:   /* inspect the matrices A and B to understand the memory requirement for the next step */
2156:   PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2157:   /* ask bufferSize again bytes for external memory */
2158:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2159:   /* The HIPSPARSE documentation is not clear, nor the API
2160:      We need both buffers to perform the operations properly!
2161:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2162:      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2163:      is stored in the descriptor! What a messy API... */
2164:   PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2165:   /* compute the intermediate product of A * B */
2166:   PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2167:   /* get matrix C non-zero entries C_nnz1 */
2168:   PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2169:   PetscCall(PetscIntCast(C_nnz1, &c->nz));
2170:   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2171:                       mmdata->mmBufferSize / 1024));
2172:   Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2173:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2174:   Ccsr->values = new THRUSTARRAY(c->nz);
2175:   PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2176:   // hipSparse errors with null pointers even with nz = 0
2177:   if (c->nz) PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2178:   PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2179:   PetscCall(PetscLogGpuFlops(mmdata->flops));
2180:   PetscCall(PetscLogGpuTimeEnd());
2181: finalizesym:
2182:   c->free_a = PETSC_TRUE;
2183:   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2184:   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2185:   c->free_ij = PETSC_TRUE;

2187:   PetscInt *d_i = c->i;
2188:   if (ciscompressed) d_i = c->compressedrow.i;
2189:   PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2190:   PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2191:   if (ciscompressed) { /* need to expand host row offsets */
2192:     PetscInt r = 0;
2193:     c->i[0]    = 0;
2194:     for (k = 0; k < c->compressedrow.nrows; k++) {
2195:       const PetscInt next = c->compressedrow.rindex[k];
2196:       const PetscInt old  = c->compressedrow.i[k];
2197:       for (; r < next; r++) c->i[r + 1] = old;
2198:     }
2199:     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2200:   }
2201:   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2202:   PetscCall(PetscMalloc1(m, &c->ilen));
2203:   PetscCall(PetscMalloc1(m, &c->imax));
2204:   c->maxnz         = c->nz;
2205:   c->nonzerorowcnt = 0;
2206:   c->rmax          = 0;
2207:   for (k = 0; k < m; k++) {
2208:     const PetscInt nn = c->i[k + 1] - c->i[k];
2209:     c->ilen[k] = c->imax[k] = nn;
2210:     c->nonzerorowcnt += (PetscInt)!!nn;
2211:     c->rmax = PetscMax(c->rmax, nn);
2212:   }
2213:   PetscCall(PetscMalloc1(c->nz, &c->a));
2214:   Ccsr->num_entries = c->nz;

2216:   C->nonzerostate++;
2217:   PetscCall(PetscLayoutSetUp(C->rmap));
2218:   PetscCall(PetscLayoutSetUp(C->cmap));
2219:   Ccusp->nonzerostate = C->nonzerostate;
2220:   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
2221:   C->preallocated     = PETSC_TRUE;
2222:   C->assembled        = PETSC_FALSE;
2223:   C->was_assembled    = PETSC_FALSE;
2224:   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2225:     mmdata->reusesym = PETSC_TRUE;
2226:     C->offloadmask   = PETSC_OFFLOAD_GPU;
2227:   }
2228:   C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2229:   PetscFunctionReturn(PETSC_SUCCESS);
2230: }

2232: /* handles sparse or dense B */
2233: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2234: {
2235:   Mat_Product *product = mat->product;
2236:   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

2238:   PetscFunctionBegin;
2239:   MatCheckProduct(mat, 1);
2240:   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2241:   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2242:   if (product->type == MATPRODUCT_ABC) {
2243:     Ciscusp = PETSC_FALSE;
2244:     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2245:   }
2246:   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2247:     PetscBool usecpu = PETSC_FALSE;
2248:     switch (product->type) {
2249:     case MATPRODUCT_AB:
2250:       if (product->api_user) {
2251:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2252:         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2253:         PetscOptionsEnd();
2254:       } else {
2255:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2256:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2257:         PetscOptionsEnd();
2258:       }
2259:       break;
2260:     case MATPRODUCT_AtB:
2261:       if (product->api_user) {
2262:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2263:         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2264:         PetscOptionsEnd();
2265:       } else {
2266:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2267:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2268:         PetscOptionsEnd();
2269:       }
2270:       break;
2271:     case MATPRODUCT_PtAP:
2272:       if (product->api_user) {
2273:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2274:         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2275:         PetscOptionsEnd();
2276:       } else {
2277:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2278:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2279:         PetscOptionsEnd();
2280:       }
2281:       break;
2282:     case MATPRODUCT_RARt:
2283:       if (product->api_user) {
2284:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2285:         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2286:         PetscOptionsEnd();
2287:       } else {
2288:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2289:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2290:         PetscOptionsEnd();
2291:       }
2292:       break;
2293:     case MATPRODUCT_ABC:
2294:       if (product->api_user) {
2295:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2296:         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2297:         PetscOptionsEnd();
2298:       } else {
2299:         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2300:         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2301:         PetscOptionsEnd();
2302:       }
2303:       break;
2304:     default:
2305:       break;
2306:     }
2307:     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2308:   }
2309:   /* dispatch */
2310:   if (isdense) {
2311:     switch (product->type) {
2312:     case MATPRODUCT_AB:
2313:     case MATPRODUCT_AtB:
2314:     case MATPRODUCT_ABt:
2315:     case MATPRODUCT_PtAP:
2316:     case MATPRODUCT_RARt:
2317:       if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2318:       else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2319:       break;
2320:     case MATPRODUCT_ABC:
2321:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2322:       break;
2323:     default:
2324:       break;
2325:     }
2326:   } else if (Biscusp && Ciscusp) {
2327:     switch (product->type) {
2328:     case MATPRODUCT_AB:
2329:     case MATPRODUCT_AtB:
2330:     case MATPRODUCT_ABt:
2331:       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2332:       break;
2333:     case MATPRODUCT_PtAP:
2334:     case MATPRODUCT_RARt:
2335:     case MATPRODUCT_ABC:
2336:       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2337:       break;
2338:     default:
2339:       break;
2340:     }
2341:   } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2342:   PetscFunctionReturn(PETSC_SUCCESS);
2343: }

2345: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2346: {
2347:   PetscFunctionBegin;
2348:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2349:   PetscFunctionReturn(PETSC_SUCCESS);
2350: }

2352: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2353: {
2354:   PetscFunctionBegin;
2355:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2356:   PetscFunctionReturn(PETSC_SUCCESS);
2357: }

2359: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2360: {
2361:   PetscFunctionBegin;
2362:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2363:   PetscFunctionReturn(PETSC_SUCCESS);
2364: }

2366: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2367: {
2368:   PetscFunctionBegin;
2369:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2370:   PetscFunctionReturn(PETSC_SUCCESS);
2371: }

2373: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2374: {
2375:   PetscFunctionBegin;
2376:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2377:   PetscFunctionReturn(PETSC_SUCCESS);
2378: }

2380: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2381: {
2382:   int i = blockIdx.x * blockDim.x + threadIdx.x;
2383:   if (i < n) y[idx[i]] += x[i];
2384: }

2386: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2387: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2388: {
2389:   Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
2390:   Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2391:   Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
2392:   PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
2393:   hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2394:   PetscBool                      compressed;
2395:   PetscInt                       nx, ny;

2397:   PetscFunctionBegin;
2398:   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2399:   if (!a->nz) {
2400:     if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
2401:     else PetscCall(VecSeq_HIP::Set(zz, 0));
2402:     PetscFunctionReturn(PETSC_SUCCESS);
2403:   }
2404:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2405:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2406:   if (!trans) {
2407:     matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
2408:     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
2409:   } else {
2410:     if (herm || !A->form_explicit_transpose) {
2411:       opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
2412:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
2413:     } else {
2414:       if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2415:       matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
2416:     }
2417:   }
2418:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2419:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2420:   try {
2421:     PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
2422:     if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2423:     else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

2425:     PetscCall(PetscLogGpuTimeBegin());
2426:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
2427:       /* z = A x + beta y.
2428:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2429:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2430:       */
2431:       xptr = xarray;
2432:       dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
2433:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2434:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2435:           allocated to accommodate different uses. So we get the length info directly from mat.
2436:        */
2437:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2438:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2439:         nx             = mat->num_cols;
2440:         ny             = mat->num_rows;
2441:       }
2442:     } else {
2443:       /* z = A^T x + beta y
2444:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2445:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2446:        */
2447:       xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
2448:       dptr = zarray;
2449:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2450:       if (compressed) { /* Scatter x to work vector */
2451:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2452:         thrust::for_each(
2453: #if PetscDefined(HAVE_THRUST_ASYNC)
2454:           thrust::hip::par.on(PetscDefaultHipStream),
2455: #endif
2456:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2457:           thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
2458:       }
2459:       if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2460:         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2461:         nx             = mat->num_rows;
2462:         ny             = mat->num_cols;
2463:       }
2464:     }
2465:     /* csr_spmv does y = alpha op(A) x + beta y */
2466:     if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2467: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !(PETSC_PKG_HIP_VERSION_GT(6, 4, 3) && PETSC_PKG_HIP_VERSION_LE(7, 2, 0))
2468:       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
2469:       if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
2470:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
2471:         PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
2472:         PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
2473:                                                     &matstruct->hipSpMV[opA].spmvBufferSize));
2474:         PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
2475:         matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
2476:       } else {
2477:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2478:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
2479:         PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
2480:       }
2481:       PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
2482:                                        matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
2483: #else
2484:       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2485:       nx             = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
2486:       ny             = mat->num_cols;
2487:       PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
2488: #endif
2489:     } else {
2490:       if (hipsparsestruct->nrows) {
2491:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
2492:         PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
2493:       }
2494:     }
2495:     PetscCall(PetscLogGpuTimeEnd());

2497:     if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
2498:       if (yy) {                                     /* MatMultAdd: zz = A*xx + yy */
2499:         if (compressed) {                           /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2500:           PetscCall(VecSeq_HIP::Copy(yy, zz));      /* zz = yy */
2501:         } else if (zz != yy) {                      /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2502:           PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
2503:         }
2504:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2505:         PetscCall(VecSeq_HIP::Set(zz, 0));
2506:       }

2508:       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2509:       if (compressed) {
2510:         PetscCall(PetscLogGpuTimeBegin());
2511:         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
2512:            and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
2513:            prevent that. So I just add a ScatterAdd kernel.
2514:          */
2515: #if 0
2516:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2517:         thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
2518:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2519:                          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2520:                          VecHIPPlusEquals());
2521: #else
2522:         PetscInt n = matstruct->cprowIndices->size();
2523:         hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
2524: #endif
2525:         PetscCall(PetscLogGpuTimeEnd());
2526:       }
2527:     } else {
2528:       if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
2529:     }
2530:     PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
2531:     if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
2532:     else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
2533:   } catch (char *ex) {
2534:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2535:   }
2536:   if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2537:   else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2538:   PetscFunctionReturn(PETSC_SUCCESS);
2539: }

2541: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2542: {
2543:   PetscFunctionBegin;
2544:   PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2545:   PetscFunctionReturn(PETSC_SUCCESS);
2546: }

2548: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
2549: {
2550:   PetscFunctionBegin;
2551:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AssemblyEnd(A, mode));
2552:   PetscFunctionReturn(PETSC_SUCCESS);
2553: }

2555: /*@
2556:   MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
2557:   This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for calculations.

2559:   Collective

2561:   Input Parameters:
2562: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2563: . m    - number of rows
2564: . n    - number of columns
2565: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
2566: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

2568:   Output Parameter:
2569: . A - the matrix

2571:   Level: intermediate

2573:   Notes:
2574:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2575:   `MatXXXXSetPreallocation()` paradgm instead of this routine directly.
2576:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]

2578:   The AIJ format (compressed row storage), is fully compatible with standard Fortran
2579:   storage.  That is, the stored row and column indices can begin at
2580:   either one (as in Fortran) or zero.

2582:   Specify the preallocated storage with either `nz` or `nnz` (not both).
2583:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2584:   allocation.

2586: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
2587: @*/
2588: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2589: {
2590:   return MatSeqAIJHIPSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2591: }

2593: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
2594: {
2595:   return MatSeqAIJHIPSPARSE_CUPM_t::Destroy(A);
2596: }

2598: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2599: {
2600:   PetscFunctionBegin;
2601:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2602:   PetscFunctionReturn(PETSC_SUCCESS);
2603: }

2605: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2606: {
2607:   Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2608:   Mat_SeqAIJHIPSPARSE *cy;
2609:   Mat_SeqAIJHIPSPARSE *cx;
2610:   CsrMatrix           *csry, *csrx;

2612:   PetscFunctionBegin;
2613:   cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
2614:   cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
2615:   if (X->ops->axpy != Y->ops->axpy) {
2616:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2617:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2618:     PetscFunctionReturn(PETSC_SUCCESS);
2619:   }
2620:   /* if we are here, it means both matrices are bound to GPU */
2621:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
2622:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
2623:   PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
2624:   PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
2625:   csry = (CsrMatrix *)cy->mat->mat;
2626:   csrx = (CsrMatrix *)cx->mat->mat;
2627:   /* see if we can turn this into a cublas axpy */
2628:   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2629:     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2630:     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2631:     if (eq) str = SAME_NONZERO_PATTERN;
2632:   }
2633:   /* spgeam is buggy with one column */
2634:   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

2636: #if !defined(PETSC_USE_64BIT_INDICES) // hipsparseScsrgeam2 etc. do not support 64bit indices
2637:   if (str == SUBSET_NONZERO_PATTERN) {
2638:     PetscScalar       *ay, b = 1.0;
2639:     const PetscScalar *ax;
2640:     size_t             bufferSize;
2641:     void              *buffer;

2643:     PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
2644:     PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
2645:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
2646:     PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2647:                                                        csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2648:     PetscCallHIP(hipMalloc(&buffer, bufferSize));
2649:     PetscCall(PetscLogGpuTimeBegin());
2650:     PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2651:                                             csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2652:     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2653:     PetscCall(PetscLogGpuTimeEnd());
2654:     PetscCallHIP(hipFree(buffer));

2656:     PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2657:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
2658:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
2659:   } else
2660: #endif
2661:     if (str == SAME_NONZERO_PATTERN) {
2662:     PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2663:   } else {
2664:     PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2665:     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2666:   }
2667:   PetscFunctionReturn(PETSC_SUCCESS);
2668: }

2670: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
2671: {
2672:   PetscFunctionBegin;
2673:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Scale(Y, a));
2674:   PetscFunctionReturn(PETSC_SUCCESS);
2675: }

2677: static PetscErrorCode MatGetDiagonal_SeqAIJHIPSPARSE(Mat A, Vec diag)
2678: {
2679:   PetscFunctionBegin;
2680:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetDiagonal(A, diag));
2681:   PetscFunctionReturn(PETSC_SUCCESS);
2682: }

2684: static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat A, Vec ll, Vec rr)
2685: {
2686:   PetscFunctionBegin;
2687:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2688:   PetscFunctionReturn(PETSC_SUCCESS);
2689: }

2691: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
2692: {
2693:   PetscFunctionBegin;
2694:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::ZeroEntries(A));
2695:   PetscFunctionReturn(PETSC_SUCCESS);
2696: }

2698: static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(Mat A, PetscMemType *m)
2699: {
2700:   PetscFunctionBegin;
2701:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetCurrentMemType(A, m));
2702:   PetscFunctionReturn(PETSC_SUCCESS);
2703: }

2705: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
2706: {
2707:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

2709:   PetscFunctionBegin;
2710:   if (A->factortype != MAT_FACTOR_NONE) {
2711:     A->boundtocpu = flg;
2712:     PetscFunctionReturn(PETSC_SUCCESS);
2713:   }
2714:   if (flg) {
2715:     PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

2717:     A->ops->scale                     = MatScale_SeqAIJ;
2718:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJ;
2719:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
2720:     A->ops->axpy                      = MatAXPY_SeqAIJ;
2721:     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
2722:     A->ops->mult                      = MatMult_SeqAIJ;
2723:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2724:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2725:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2726:     A->ops->multhermitiantranspose    = NULL;
2727:     A->ops->multhermitiantransposeadd = NULL;
2728:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
2729:     A->ops->getcurrentmemtype         = NULL;
2730:     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2731:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2732:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
2733:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
2734:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2735:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2736:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
2737:   } else {
2738:     A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
2739:     A->ops->getdiagonal               = MatGetDiagonal_SeqAIJHIPSPARSE;
2740:     A->ops->diagonalscale             = MatDiagonalScale_SeqAIJHIPSPARSE;
2741:     A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
2742:     A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
2743:     A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
2744:     A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
2745:     A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
2746:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
2747:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
2748:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
2749:     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
2750:     A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
2751:     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
2752:     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
2753:     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
2754:     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
2755:     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
2756:     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
2757:     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;

2759:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
2760:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2761:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2762:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
2763:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
2764:     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2765:   }
2766:   A->boundtocpu = flg;
2767:   a->inode.use  = (flg && a->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
2768:   PetscFunctionReturn(PETSC_SUCCESS);
2769: }

2771: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
2772: {
2773:   Mat B;

2775:   PetscFunctionBegin;
2776:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
2777:   if (reuse == MAT_INITIAL_MATRIX) {
2778:     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2779:   } else if (reuse == MAT_REUSE_MATRIX) {
2780:     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2781:   }
2782:   B = *newmat;
2783:   PetscCall(PetscFree(B->defaultvectype));
2784:   PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
2785:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2786:     if (B->factortype == MAT_FACTOR_NONE) {
2787:       Mat_SeqAIJHIPSPARSE *spptr;
2788:       PetscCall(PetscNew(&spptr));
2789:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
2790:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
2791:       spptr->format  = MAT_HIPSPARSE_CSR;
2792:       spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
2793:       spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2794:       //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

2796:       B->spptr = spptr;
2797:     } else {
2798:       Mat_SeqAIJHIPSPARSETriFactors *spptr;

2800:       PetscCall(PetscNew(&spptr));
2801:       PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
2802:       PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
2803:       B->spptr = spptr;
2804:     }
2805:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2806:   }
2807:   B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
2808:   B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
2809:   B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
2810:   B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
2811:   B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
2812:   B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
2813:   B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;

2815:   PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
2816:   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
2817:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
2818: #if defined(PETSC_HAVE_HYPRE)
2819:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
2820: #endif
2821:   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
2822:   PetscFunctionReturn(PETSC_SUCCESS);
2823: }

2825: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
2826: {
2827:   PetscFunctionBegin;
2828:   PetscCall(MatCreate_SeqAIJ(B));
2829:   PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
2830:   PetscFunctionReturn(PETSC_SUCCESS);
2831: }

2833: /*MC
2834:    MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs

2836:    A matrix type whose data resides on AMD GPUs. These matrices can be in either
2837:    CSR, ELL, or Hybrid format.
2838:    All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.

2840:    Options Database Keys:
2841: +  -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
2842: .  -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2843:                                        Other options include ell (ellpack) or hyb (hybrid).
2844: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2845: -  -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU

2847:   Level: beginner

2849: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
2850: M*/

2852: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
2853: {
2854:   PetscFunctionBegin;
2855:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
2856:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
2857:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
2858:   PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
2859:   PetscFunctionReturn(PETSC_SUCCESS);
2860: }

2862: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
2863: {
2864:   Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

2866:   PetscFunctionBegin;
2867:   if (cusp) {
2868:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2869:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2870:     delete cusp->workVector;
2871:     delete cusp->rowoffsets_gpu;
2872:     delete cusp->csr2csc_i;
2873:     delete cusp->coords;
2874:     if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
2875:     PetscCall(PetscFree(mat->spptr));
2876:   }
2877:   PetscFunctionReturn(PETSC_SUCCESS);
2878: }

2880: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2881: {
2882:   PetscFunctionBegin;
2883:   if (*mat) {
2884:     delete (*mat)->values;
2885:     delete (*mat)->column_indices;
2886:     delete (*mat)->row_offsets;
2887:     delete *mat;
2888:     *mat = 0;
2889:   }
2890:   PetscFunctionReturn(PETSC_SUCCESS);
2891: }

2893: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
2894: {
2895:   CsrMatrix *mat;

2897:   PetscFunctionBegin;
2898:   if (*matstruct) {
2899:     if ((*matstruct)->mat) {
2900:       if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
2901:         hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
2902:         PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
2903:       } else {
2904:         mat = (CsrMatrix *)(*matstruct)->mat;
2905:         PetscCall(CsrMatrix_Destroy(&mat));
2906:       }
2907:     }
2908:     if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
2909:     delete (*matstruct)->cprowIndices;
2910:     PetscCallHIP(hipFree((*matstruct)->alpha_one));
2911:     PetscCallHIP(hipFree((*matstruct)->beta_zero));
2912:     PetscCallHIP(hipFree((*matstruct)->beta_one));

2914:     Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
2915:     if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
2916:     for (int i = 0; i < 3; i++) {
2917:       if (mdata->hipSpMV[i].initialized) {
2918:         PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
2919:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
2920:         PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
2921:       }
2922:     }
2923:     delete *matstruct;
2924:     *matstruct = NULL;
2925:   }
2926:   PetscFunctionReturn(PETSC_SUCCESS);
2927: }

2929: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
2930: {
2931:   Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

2933:   PetscFunctionBegin;
2934:   if (fs) {
2935:     delete fs->rpermIndices;
2936:     delete fs->cpermIndices;
2937:     fs->rpermIndices  = NULL;
2938:     fs->cpermIndices  = NULL;
2939:     fs->init_dev_prop = PETSC_FALSE;
2940:     PetscCallHIP(hipFree(fs->csrRowPtr));
2941:     PetscCallHIP(hipFree(fs->csrColIdx));
2942:     PetscCallHIP(hipFree(fs->csrRowPtr32));
2943:     PetscCallHIP(hipFree(fs->csrColIdx32));
2944:     PetscCallHIP(hipFree(fs->csrVal));
2945:     PetscCallHIP(hipFree(fs->diag));
2946:     PetscCallHIP(hipFree(fs->X));
2947:     PetscCallHIP(hipFree(fs->Y));
2948:     // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
2949:     PetscCallHIP(hipFree(fs->spsvBuffer_L));
2950:     PetscCallHIP(hipFree(fs->spsvBuffer_U));
2951:     PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
2952:     PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
2953:     PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
2954:     if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
2955:     if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
2956:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
2957:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
2958:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
2959:     PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
2960:     if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
2961:     if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
2962:     PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
2963:     PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));
2964:     PetscCall(PetscFree(fs->csrRowPtr_h));
2965:     PetscCall(PetscFree(fs->csrVal_h));
2966:     PetscCall(PetscFree(fs->diag_h));
2967:     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
2968:     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
2969:   }
2970:   PetscFunctionReturn(PETSC_SUCCESS);
2971: }

2973: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
2974: {
2975:   PetscFunctionBegin;
2976:   if (*trifactors) {
2977:     PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
2978:     PetscCallHIPSPARSE(hipsparseDestroy((*trifactors)->handle));
2979:     PetscCall(PetscFree(*trifactors));
2980:   }
2981:   PetscFunctionReturn(PETSC_SUCCESS);
2982: }

2984: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
2985: {
2986:   Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

2988:   PetscFunctionBegin;
2989:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
2990:   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
2991:   if (destroy) {
2992:     PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2993:     delete cusp->csr2csc_i;
2994:     cusp->csr2csc_i = NULL;
2995:   }
2996:   A->transupdated = PETSC_FALSE;
2997:   PetscFunctionReturn(PETSC_SUCCESS);
2998: }

3000: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3001: {
3002:   PetscFunctionBegin;
3003:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3004:   PetscFunctionReturn(PETSC_SUCCESS);
3005: }

3007: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3008: {
3009:   PetscFunctionBegin;
3010:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3011:   PetscFunctionReturn(PETSC_SUCCESS);
3012: }

3014: /*@C
3015:   MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.

3017:   Not Collective

3019:   Input Parameters:
3020: + A          - the matrix
3021: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

3023:   Output Parameters:
3024: + i - the CSR row pointers
3025: - j - the CSR column indices

3027:   Level: developer

3029:   Note:
3030:   When compressed is true, the CSR structure does not contain empty rows

3032: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3033: @*/
3034: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3035: {
3036:   PetscFunctionBegin;
3037:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3038:   PetscFunctionReturn(PETSC_SUCCESS);
3039: }

3041: /*@C
3042:   MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`

3044:   Not Collective

3046:   Input Parameters:
3047: + A          - the matrix
3048: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3049: . i          - the CSR row pointers
3050: - j          - the CSR column indices

3052:   Level: developer

3054: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3055: @*/
3056: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3057: {
3058:   PetscFunctionBegin;
3059:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3060:   PetscFunctionReturn(PETSC_SUCCESS);
3061: }

3063: /*@C
3064:   MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3066:   Not Collective

3068:   Input Parameter:
3069: . A - a `MATSEQAIJHIPSPARSE` matrix

3071:   Output Parameter:
3072: . a - pointer to the device data

3074:   Level: developer

3076:   Note:
3077:   May trigger host-device copies if the up-to-date matrix data is on host

3079: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3080: @*/
3081: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3082: {
3083:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayRead(A, a);
3084: }

3086: /*@C
3087:   MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`

3089:   Not Collective

3091:   Input Parameters:
3092: + A - a `MATSEQAIJHIPSPARSE` matrix
3093: - a - pointer to the device data

3095:   Level: developer

3097: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3098: @*/
3099: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3100: {
3101:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayRead(A, a);
3102: }

3104: /*@C
3105:   MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3107:   Not Collective

3109:   Input Parameter:
3110: . A - a `MATSEQAIJHIPSPARSE` matrix

3112:   Output Parameter:
3113: . a - pointer to the device data

3115:   Level: developer

3117:   Note:
3118:   May trigger host-device copies if up-to-date matrix data is on host

3120: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3121: @*/
3122: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3123: {
3124:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArray(A, a);
3125: }
3126: /*@C
3127:   MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`

3129:   Not Collective

3131:   Input Parameters:
3132: + A - a `MATSEQAIJHIPSPARSE` matrix
3133: - a - pointer to the device data

3135:   Level: developer

3137: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
3138: @*/
3139: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
3140: {
3141:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArray(A, a);
3142: }

3144: /*@C
3145:   MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored

3147:   Not Collective

3149:   Input Parameter:
3150: . A - a `MATSEQAIJHIPSPARSE` matrix

3152:   Output Parameter:
3153: . a - pointer to the device data

3155:   Level: developer

3157:   Note:
3158:   Does not trigger host-device copies and flags data validity on the GPU

3160: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
3161: @*/
3162: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
3163: {
3164:   return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayWrite(A, a);
3165: }

3167: /*@C
3168:   MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`

3170:   Not Collective

3172:   Input Parameters:
3173: + A - a `MATSEQAIJHIPSPARSE` matrix
3174: - a - pointer to the device data

3176:   Level: developer

3178: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
3179: @*/
3180: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
3181: {
3182:   return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3183: }

3185: struct IJCompare4 {
3186:   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3187:   {
3188:     if (t1.get<0>() < t2.get<0>()) return true;
3189:     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3190:     return false;
3191:   }
3192: };

3194: struct Shift {
3195:   int _shift;

3197:   Shift(int shift) : _shift(shift) { }
3198:   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
3199: };

3201: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3202: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3203: {
3204:   Mat_SeqAIJ                    *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3205:   Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
3206:   Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
3207:   CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
3208:   PetscInt                       Annz, Bnnz;
3209:   PetscInt                       i, m, n, zero = 0;

3211:   PetscFunctionBegin;
3214:   PetscAssertPointer(C, 4);
3215:   PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3216:   PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
3217:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3218:   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3219:   PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3220:   PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3221:   if (reuse == MAT_INITIAL_MATRIX) {
3222:     m = A->rmap->n;
3223:     n = A->cmap->n + B->cmap->n;
3224:     PetscCall(MatCreate(PETSC_COMM_SELF, C));
3225:     PetscCall(MatSetSizes(*C, m, n, m, n));
3226:     PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
3227:     c                       = (Mat_SeqAIJ *)(*C)->data;
3228:     Ccusp                   = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
3229:     Cmat                    = new Mat_SeqAIJHIPSPARSEMultStruct;
3230:     Ccsr                    = new CsrMatrix;
3231:     Cmat->cprowIndices      = NULL;
3232:     c->compressedrow.use    = PETSC_FALSE;
3233:     c->compressedrow.nrows  = 0;
3234:     c->compressedrow.i      = NULL;
3235:     c->compressedrow.rindex = NULL;
3236:     Ccusp->workVector       = NULL;
3237:     Ccusp->nrows            = m;
3238:     Ccusp->mat              = Cmat;
3239:     Ccusp->mat->mat         = Ccsr;
3240:     Ccsr->num_rows          = m;
3241:     Ccsr->num_cols          = n;
3242:     PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
3243:     PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
3244:     PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
3245:     PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3246:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3247:     PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3248:     PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3249:     PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
3250:     PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3251:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3252:     PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
3253:     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3254:     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");

3256:     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
3257:     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
3258:     Annz                 = (PetscInt)Acsr->column_indices->size();
3259:     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
3260:     c->nz                = Annz + Bnnz;
3261:     Ccsr->row_offsets    = new THRUSTINTARRAY(m + 1);
3262:     Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
3263:     Ccsr->values         = new THRUSTARRAY(c->nz);
3264:     Ccsr->num_entries    = c->nz;
3265:     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
3266:     if (c->nz) {
3267:       auto            Acoo = new THRUSTINTARRAY(Annz); // initialized with zeros
3268:       auto            Bcoo = new THRUSTINTARRAY(Bnnz);
3269:       auto            Ccoo = new THRUSTINTARRAY(c->nz);
3270:       THRUSTINTARRAY *Aroff, *Broff;

3272:       if (a->compressedrow.use) { /* need full row offset */
3273:         if (!Acusp->rowoffsets_gpu) {
3274:           Acusp->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
3275:           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3276:           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3277:         }
3278:         Aroff = Acusp->rowoffsets_gpu;
3279:       } else Aroff = Acsr->row_offsets;
3280:       if (b->compressedrow.use) { /* need full row offset */
3281:         if (!Bcusp->rowoffsets_gpu) {
3282:           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
3283:           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3284:           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3285:         }
3286:         Broff = Bcusp->rowoffsets_gpu;
3287:       } else Broff = Bcsr->row_offsets;
3288:       PetscCall(PetscLogGpuTimeBegin());
3289:       // Implement cusparseXcsr2coo() with Thrust, as the former doesn't support 64-bit indices.
3290:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Aroff->data().get(), Acoo->data().get())));
3291:       PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Broff->data().get(), Bcoo->data().get())));

3293:       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3294:       auto Aperm = thrust::make_constant_iterator(1);
3295:       auto Bperm = thrust::make_constant_iterator(0);
3296:       auto Bcib  = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3297:       auto Bcie  = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3298:       auto wPerm = new THRUSTINTARRAY(Annz + Bnnz);
3299:       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3300:       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3301:       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); // Use B column indices shifted by A->cmap->n
3302:       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3303:       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3304:       auto p1    = Ccusp->coords->begin();
3305:       auto p2    = Ccusp->coords->begin();
3306:       thrust::advance(p2, Annz);
3307:       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); // put nonzeros in A and B to C in sorted order (by row and then by column)
3308:       auto cci  = thrust::make_counting_iterator(zero);
3309:       auto cce  = thrust::make_counting_iterator(c->nz);
3310:       auto pred = [](const int &x) { return x; };
3311:       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3312:       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3313:       // Implement a simplified hipsparseXcoo2csr() with Thrust (assuming the row indices are already sorted), as the former doesn't support 64-bit indices.
3314:       PetscCallThrust(thrust::lower_bound(thrust::device, Ccoo->begin(), Ccoo->end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m + 1), Ccsr->row_offsets->begin()));
3315:       PetscCall(PetscLogGpuTimeEnd());
3316:       delete wPerm;
3317:       delete Acoo;
3318:       delete Bcoo;
3319:       delete Ccoo;
3320:       PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
3321:       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3322:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3323:         PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
3324:         PetscBool                      AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3325:         Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
3326:         CsrMatrix                     *CcsrT = new CsrMatrix;
3327:         CsrMatrix                     *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3328:         CsrMatrix                     *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

3330:         (*C)->form_explicit_transpose = PETSC_TRUE;
3331:         (*C)->transupdated            = PETSC_TRUE;
3332:         Ccusp->rowoffsets_gpu         = NULL;
3333:         CmatT->cprowIndices           = NULL;
3334:         CmatT->mat                    = CcsrT;
3335:         CcsrT->num_rows               = n;
3336:         CcsrT->num_cols               = m;
3337:         CcsrT->num_entries            = c->nz;

3339:         CcsrT->row_offsets    = new THRUSTINTARRAY(n + 1);
3340:         CcsrT->column_indices = new THRUSTINTARRAY(c->nz);
3341:         CcsrT->values         = new THRUSTARRAY(c->nz);

3343:         PetscCall(PetscLogGpuTimeBegin());
3344:         auto rT = CcsrT->row_offsets->begin();
3345:         if (AT) {
3346:           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3347:           thrust::advance(rT, -1);
3348:         }
3349:         if (BT) {
3350:           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3351:           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3352:           thrust::copy(titb, tite, rT);
3353:         }
3354:         auto cT = CcsrT->column_indices->begin();
3355:         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3356:         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3357:         auto vT = CcsrT->values->begin();
3358:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3359:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3360:         PetscCall(PetscLogGpuTimeEnd());

3362:         PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
3363:         PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
3364:         PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
3365:         PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3366:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3367:         PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3368:         PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3369:         PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
3370:         PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3371:         PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
3372:         Ccusp->matTranspose = CmatT;
3373:       }
3374:     }

3376:     c->free_a = PETSC_TRUE;
3377:     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3378:     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3379:     c->free_ij = PETSC_TRUE;
3380:     PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
3381:     PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
3382:     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3383:     PetscCall(PetscMalloc1(m, &c->ilen));
3384:     PetscCall(PetscMalloc1(m, &c->imax));
3385:     c->maxnz         = c->nz;
3386:     c->nonzerorowcnt = 0;
3387:     c->rmax          = 0;
3388:     for (i = 0; i < m; i++) {
3389:       const PetscInt nn = c->i[i + 1] - c->i[i];
3390:       c->ilen[i] = c->imax[i] = nn;
3391:       c->nonzerorowcnt += (PetscInt)!!nn;
3392:       c->rmax = PetscMax(c->rmax, nn);
3393:     }
3394:     PetscCall(PetscMalloc1(c->nz, &c->a));
3395:     (*C)->nonzerostate++;
3396:     PetscCall(PetscLayoutSetUp((*C)->rmap));
3397:     PetscCall(PetscLayoutSetUp((*C)->cmap));
3398:     Ccusp->nonzerostate = (*C)->nonzerostate;
3399:     (*C)->preallocated  = PETSC_TRUE;
3400:   } else {
3401:     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3402:     c = (Mat_SeqAIJ *)(*C)->data;
3403:     if (c->nz) {
3404:       Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
3405:       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3406:       PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3407:       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3408:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3409:       PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
3410:       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3411:       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3412:       Acsr = (CsrMatrix *)Acusp->mat->mat;
3413:       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3414:       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3415:       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3416:       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3417:       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3418:       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3419:       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3420:       auto pmid = Ccusp->coords->begin();
3421:       thrust::advance(pmid, Acsr->num_entries);
3422:       PetscCall(PetscLogGpuTimeBegin());
3423:       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3424:       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3425:       thrust::for_each(zibait, zieait, VecHIPEquals());
3426:       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3427:       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3428:       thrust::for_each(zibbit, ziebit, VecHIPEquals());
3429:       PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3430:       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3431:         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
3432:         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3433:         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3434:         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3435:         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3436:         auto       vT    = CcsrT->values->begin();
3437:         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3438:         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3439:         (*C)->transupdated = PETSC_TRUE;
3440:       }
3441:       PetscCall(PetscLogGpuTimeEnd());
3442:     }
3443:   }
3444:   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3445:   (*C)->assembled     = PETSC_TRUE;
3446:   (*C)->was_assembled = PETSC_FALSE;
3447:   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
3448:   PetscFunctionReturn(PETSC_SUCCESS);
3449: }

3451: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3452: {
3453:   PetscFunctionBegin;
3454:   PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3455:   PetscFunctionReturn(PETSC_SUCCESS);
3456: }