Actual source code: aijhipsparse.hip.cxx
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/iterator/transform_iterator.h>
17: #if PETSC_CPP_VERSION >= 14
18: #define PETSC_HAVE_THRUST_ASYNC 1
19: #include <thrust/async/for_each.h>
20: #endif
21: #include <thrust/iterator/constant_iterator.h>
22: #include <thrust/iterator/discard_iterator.h>
23: #include <thrust/binary_search.h>
24: #include <thrust/remove.h>
25: #include <thrust/sort.h>
26: #include <thrust/unique.h>
/*
  Name tables handed to PetscOptionsEnum() further down in this file. Each table lists the
  selectable value names first (their position is the enum value that gets stored), followed by
  two trailing strings (an enum type name and a prefix) and a null terminator.
*/
const char *const MatHIPSPARSEStorageFormats[] = {
  "CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", nullptr,
};
const char *const MatHIPSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", nullptr,
};
const char *const MatHIPSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", nullptr,
};
/* Disabled: the CSR-to-CSC algorithm table is not compiled in. */
//const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
33: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
34: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
35: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
36: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
37: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
38: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
39: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
41: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
43: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
44: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
45: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
46: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
47: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
48: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
49: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
50: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
51: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
52: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
53: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
54: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
55: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
56: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
57: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
58: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
59: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
60: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
61: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
62: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
63: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
64: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
66: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
69: /*
70: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71: {
72: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
74: PetscFunctionBegin;
75: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76: hipsparsestruct->stream = stream;
77: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78: PetscFunctionReturn(PETSC_SUCCESS);
79: }
81: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82: {
83: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
85: PetscFunctionBegin;
86: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87: if (hipsparsestruct->handle != handle) {
88: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89: hipsparsestruct->handle = handle;
90: }
91: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
95: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96: {
97: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98: PetscBool flg;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
106: */
108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
112: PetscFunctionBegin;
113: switch (op) {
114: case MAT_HIPSPARSE_MULT:
115: hipsparsestruct->format = format;
116: break;
117: case MAT_HIPSPARSE_ALL:
118: hipsparsestruct->format = format;
119: break;
120: default:
121: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122: }
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: /*@
127: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
128: operation. Only the `MatMult()` operation can use different GPU storage formats
130: Not Collective
132: Input Parameters:
133: + A - Matrix of type `MATSEQAIJHIPSPARSE`
134: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
138: Level: intermediate
140: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
142: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143: {
144: PetscFunctionBegin;
146: PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151: {
152: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
154: PetscFunctionBegin;
155: hipsparsestruct->use_cpu_solve = use_cpu;
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: /*@
160: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
162: Input Parameters:
163: + A - Matrix of type `MATSEQAIJHIPSPARSE`
164: - use_cpu - set flag for using the built-in CPU `MatSolve()`
166: Level: intermediate
168: Notes:
169: The hipSparse LU solver currently computes the factors with the built-in CPU method
170: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
171: This method to specifies if the solve is done on the CPU or GPU (GPU is the default).
173: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
175: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176: {
177: PetscFunctionBegin;
179: PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184: {
185: PetscFunctionBegin;
186: switch (op) {
187: case MAT_FORM_EXPLICIT_TRANSPOSE:
188: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190: A->form_explicit_transpose = flg;
191: break;
192: default:
193: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194: break;
195: }
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200: {
201: PetscBool row_identity, col_identity;
202: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
203: IS isrow = b->row, iscol = b->col;
204: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
206: PetscFunctionBegin;
207: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209: B->offloadmask = PETSC_OFFLOAD_CPU;
210: /* determine which version of MatSolve needs to be used. */
211: PetscCall(ISIdentity(isrow, &row_identity));
212: PetscCall(ISIdentity(iscol, &col_identity));
213: if (!hipsparsestruct->use_cpu_solve) {
214: if (row_identity && col_identity) {
215: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217: } else {
218: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
219: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220: }
221: }
222: B->ops->matsolve = NULL;
223: B->ops->matsolvetranspose = NULL;
225: /* get the triangular factors */
226: if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
227: PetscFunctionReturn(PETSC_SUCCESS);
228: }
230: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
231: {
232: MatHIPSPARSEStorageFormat format;
233: PetscBool flg;
234: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
236: PetscFunctionBegin;
237: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
238: if (A->factortype == MAT_FACTOR_NONE) {
239: PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
240: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
241: PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
242: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
243: PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
244: if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
245: PetscCall(
246: PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
247: /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
248: PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
249: PetscCall(
250: PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
251: PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
252: /*
253: PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
254: PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
255: */
256: }
257: PetscOptionsHeadEnd();
258: PetscFunctionReturn(PETSC_SUCCESS);
259: }
261: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
264: PetscInt n = A->rmap->n;
265: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
266: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
267: const PetscInt *ai = a->i, *aj = a->j, *vi;
268: const MatScalar *aa = a->a, *v;
269: PetscInt *AiLo, *AjLo;
270: PetscInt i, nz, nzLower, offset, rowOffset;
272: PetscFunctionBegin;
273: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
274: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
275: try {
276: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
277: nzLower = n + ai[n] - ai[1];
278: if (!loTriFactor) {
279: PetscScalar *AALo;
280: PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));
282: /* Allocate Space for the lower triangular matrix */
283: PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
284: PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));
286: /* Fill the lower triangular matrix */
287: AiLo[0] = (PetscInt)0;
288: AiLo[n] = nzLower;
289: AjLo[0] = (PetscInt)0;
290: AALo[0] = (MatScalar)1.0;
291: v = aa;
292: vi = aj;
293: offset = 1;
294: rowOffset = 1;
295: for (i = 1; i < n; i++) {
296: nz = ai[i + 1] - ai[i];
297: /* additional 1 for the term on the diagonal */
298: AiLo[i] = rowOffset;
299: rowOffset += nz + 1;
301: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
302: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
303: offset += nz;
304: AjLo[offset] = (PetscInt)i;
305: AALo[offset] = (MatScalar)1.0;
306: offset += 1;
307: v += nz;
308: vi += nz;
309: }
311: /* allocate space for the triangular factor information */
312: PetscCall(PetscNew(&loTriFactor));
313: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
314: /* Create the matrix description */
315: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
316: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
317: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
318: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
319: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
321: /* set the operation */
322: loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
324: /* set the matrix */
325: loTriFactor->csrMat = new CsrMatrix;
326: loTriFactor->csrMat->num_rows = n;
327: loTriFactor->csrMat->num_cols = n;
328: loTriFactor->csrMat->num_entries = nzLower;
329: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
330: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
331: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
333: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
334: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
335: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
337: /* Create the solve analysis information */
338: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
339: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
340: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
342: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
344: /* perform the solve analysis */
345: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
348: PetscCallHIP(WaitForHIP());
349: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
351: /* assign the pointer */
352: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
353: loTriFactor->AA_h = AALo;
354: PetscCallHIP(hipHostFree(AiLo));
355: PetscCallHIP(hipHostFree(AjLo));
356: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
357: } else { /* update values only */
358: if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
359: /* Fill the lower triangular matrix */
360: loTriFactor->AA_h[0] = 1.0;
361: v = aa;
362: vi = aj;
363: offset = 1;
364: for (i = 1; i < n; i++) {
365: nz = ai[i + 1] - ai[i];
366: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
367: offset += nz;
368: loTriFactor->AA_h[offset] = 1.0;
369: offset += 1;
370: v += nz;
371: }
372: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
373: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
374: }
375: } catch (char *ex) {
376: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
377: }
378: }
379: PetscFunctionReturn(PETSC_SUCCESS);
380: }
382: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
383: {
384: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
385: PetscInt n = A->rmap->n;
386: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
387: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
388: const PetscInt *aj = a->j, *adiag, *vi;
389: const MatScalar *aa = a->a, *v;
390: PetscInt *AiUp, *AjUp;
391: PetscInt i, nz, nzUpper, offset;
393: PetscFunctionBegin;
394: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
395: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
396: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
397: try {
398: /* next, figure out the number of nonzeros in the upper triangular matrix. */
399: nzUpper = adiag[0] - adiag[n];
400: if (!upTriFactor) {
401: PetscScalar *AAUp;
402: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
404: /* Allocate Space for the upper triangular matrix */
405: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
406: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
408: /* Fill the upper triangular matrix */
409: AiUp[0] = (PetscInt)0;
410: AiUp[n] = nzUpper;
411: offset = nzUpper;
412: for (i = n - 1; i >= 0; i--) {
413: v = aa + adiag[i + 1] + 1;
414: vi = aj + adiag[i + 1] + 1;
415: nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
416: offset -= (nz + 1); /* decrement the offset */
418: /* first, set the diagonal elements */
419: AjUp[offset] = (PetscInt)i;
420: AAUp[offset] = (MatScalar)1. / v[nz];
421: AiUp[i] = AiUp[i + 1] - (nz + 1);
423: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
424: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
425: }
427: /* allocate space for the triangular factor information */
428: PetscCall(PetscNew(&upTriFactor));
429: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
431: /* Create the matrix description */
432: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
433: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
434: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
435: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
436: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
438: /* set the operation */
439: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
441: /* set the matrix */
442: upTriFactor->csrMat = new CsrMatrix;
443: upTriFactor->csrMat->num_rows = n;
444: upTriFactor->csrMat->num_cols = n;
445: upTriFactor->csrMat->num_entries = nzUpper;
446: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
447: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
448: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
449: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
450: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
451: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
453: /* Create the solve analysis information */
454: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
455: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
456: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
457: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
458: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
460: /* perform the solve analysis */
461: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
462: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
464: PetscCallHIP(WaitForHIP());
465: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
467: /* assign the pointer */
468: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
469: upTriFactor->AA_h = AAUp;
470: PetscCallHIP(hipHostFree(AiUp));
471: PetscCallHIP(hipHostFree(AjUp));
472: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
473: } else {
474: if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
475: /* Fill the upper triangular matrix */
476: offset = nzUpper;
477: for (i = n - 1; i >= 0; i--) {
478: v = aa + adiag[i + 1] + 1;
479: nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
480: offset -= (nz + 1); /* decrement the offset */
482: /* first, set the diagonal elements */
483: upTriFactor->AA_h[offset] = 1. / v[nz];
484: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
485: }
486: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
487: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
488: }
489: } catch (char *ex) {
490: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
491: }
492: }
493: PetscFunctionReturn(PETSC_SUCCESS);
494: }
496: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
497: {
498: PetscBool row_identity, col_identity;
499: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
500: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
501: IS isrow = a->row, iscol = a->icol;
502: PetscInt n = A->rmap->n;
504: PetscFunctionBegin;
505: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
506: PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
507: PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
509: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
510: hipsparseTriFactors->nnz = a->nz;
512: A->offloadmask = PETSC_OFFLOAD_BOTH;
513: /* lower triangular indices */
514: PetscCall(ISIdentity(isrow, &row_identity));
515: if (!row_identity && !hipsparseTriFactors->rpermIndices) {
516: const PetscInt *r;
518: PetscCall(ISGetIndices(isrow, &r));
519: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
520: hipsparseTriFactors->rpermIndices->assign(r, r + n);
521: PetscCall(ISRestoreIndices(isrow, &r));
522: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
523: }
524: /* upper triangular indices */
525: PetscCall(ISIdentity(iscol, &col_identity));
526: if (!col_identity && !hipsparseTriFactors->cpermIndices) {
527: const PetscInt *c;
529: PetscCall(ISGetIndices(iscol, &c));
530: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
531: hipsparseTriFactors->cpermIndices->assign(c, c + n);
532: PetscCall(ISRestoreIndices(iscol, &c));
533: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
534: }
535: PetscFunctionReturn(PETSC_SUCCESS);
536: }
538: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
539: {
540: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
541: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
542: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
543: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
544: PetscInt *AiUp, *AjUp;
545: PetscScalar *AAUp;
546: PetscScalar *AALo;
547: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
548: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
549: const PetscInt *ai = b->i, *aj = b->j, *vj;
550: const MatScalar *aa = b->a, *v;
552: PetscFunctionBegin;
553: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
554: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
555: try {
556: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
557: PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
558: if (!upTriFactor && !loTriFactor) {
559: /* Allocate Space for the upper triangular matrix */
560: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
561: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
563: /* Fill the upper triangular matrix */
564: AiUp[0] = (PetscInt)0;
565: AiUp[n] = nzUpper;
566: offset = 0;
567: for (i = 0; i < n; i++) {
568: /* set the pointers */
569: v = aa + ai[i];
570: vj = aj + ai[i];
571: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
573: /* first, set the diagonal elements */
574: AjUp[offset] = (PetscInt)i;
575: AAUp[offset] = (MatScalar)1.0 / v[nz];
576: AiUp[i] = offset;
577: AALo[offset] = (MatScalar)1.0 / v[nz];
579: offset += 1;
580: if (nz > 0) {
581: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
582: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
583: for (j = offset; j < offset + nz; j++) {
584: AAUp[j] = -AAUp[j];
585: AALo[j] = AAUp[j] / v[nz];
586: }
587: offset += nz;
588: }
589: }
591: /* allocate space for the triangular factor information */
592: PetscCall(PetscNew(&upTriFactor));
593: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
595: /* Create the matrix description */
596: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
597: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
598: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
599: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
600: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
602: /* set the matrix */
603: upTriFactor->csrMat = new CsrMatrix;
604: upTriFactor->csrMat->num_rows = A->rmap->n;
605: upTriFactor->csrMat->num_cols = A->cmap->n;
606: upTriFactor->csrMat->num_entries = a->nz;
607: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
608: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
609: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
610: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
611: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
612: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
614: /* set the operation */
615: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
617: /* Create the solve analysis information */
618: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
619: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
620: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
621: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
622: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
624: /* perform the solve analysis */
625: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
626: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
628: PetscCallHIP(WaitForHIP());
629: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
631: /* assign the pointer */
632: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
634: /* allocate space for the triangular factor information */
635: PetscCall(PetscNew(&loTriFactor));
636: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
638: /* Create the matrix description */
639: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
640: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
641: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
642: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
643: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
645: /* set the operation */
646: loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
648: /* set the matrix */
649: loTriFactor->csrMat = new CsrMatrix;
650: loTriFactor->csrMat->num_rows = A->rmap->n;
651: loTriFactor->csrMat->num_cols = A->cmap->n;
652: loTriFactor->csrMat->num_entries = a->nz;
653: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
654: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
655: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
656: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
657: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
658: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
660: /* Create the solve analysis information */
661: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
662: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
663: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
664: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
665: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
667: /* perform the solve analysis */
668: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
669: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
671: PetscCallHIP(WaitForHIP());
672: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
674: /* assign the pointer */
675: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
677: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
678: PetscCallHIP(hipHostFree(AiUp));
679: PetscCallHIP(hipHostFree(AjUp));
680: } else {
681: /* Fill the upper triangular matrix */
682: offset = 0;
683: for (i = 0; i < n; i++) {
684: /* set the pointers */
685: v = aa + ai[i];
686: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
688: /* first, set the diagonal elements */
689: AAUp[offset] = 1.0 / v[nz];
690: AALo[offset] = 1.0 / v[nz];
692: offset += 1;
693: if (nz > 0) {
694: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
695: for (j = offset; j < offset + nz; j++) {
696: AAUp[j] = -AAUp[j];
697: AALo[j] = AAUp[j] / v[nz];
698: }
699: offset += nz;
700: }
701: }
702: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
705: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
707: }
708: PetscCallHIP(hipHostFree(AAUp));
709: PetscCallHIP(hipHostFree(AALo));
710: } catch (char *ex) {
711: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
712: }
713: }
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
718: {
719: PetscBool perm_identity;
720: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
721: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
722: IS ip = a->row;
723: PetscInt n = A->rmap->n;
725: PetscFunctionBegin;
726: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
727: PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
728: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
729: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
731: A->offloadmask = PETSC_OFFLOAD_BOTH;
732: /* lower triangular indices */
733: PetscCall(ISIdentity(ip, &perm_identity));
734: if (!perm_identity) {
735: IS iip;
736: const PetscInt *irip, *rip;
738: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
739: PetscCall(ISGetIndices(iip, &irip));
740: PetscCall(ISGetIndices(ip, &rip));
741: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
742: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
743: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
744: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
745: PetscCall(ISRestoreIndices(iip, &irip));
746: PetscCall(ISDestroy(&iip));
747: PetscCall(ISRestoreIndices(ip, &rip));
748: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
749: }
750: PetscFunctionReturn(PETSC_SUCCESS);
751: }
753: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
754: {
755: PetscBool perm_identity;
756: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
757: IS ip = b->row;
759: PetscFunctionBegin;
760: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
761: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
762: B->offloadmask = PETSC_OFFLOAD_CPU;
763: /* determine which version of MatSolve needs to be used. */
764: PetscCall(ISIdentity(ip, &perm_identity));
765: if (perm_identity) {
766: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
767: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
768: B->ops->matsolve = NULL;
769: B->ops->matsolvetranspose = NULL;
770: } else {
771: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
772: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
773: B->ops->matsolve = NULL;
774: B->ops->matsolvetranspose = NULL;
775: }
777: /* get the triangular factors */
778: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
779: PetscFunctionReturn(PETSC_SUCCESS);
780: }
782: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
783: {
784: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
785: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
786: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
787: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
788: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
789: hipsparseIndexBase_t indexBase;
790: hipsparseMatrixType_t matrixType;
791: hipsparseFillMode_t fillMode;
792: hipsparseDiagType_t diagType;
794: PetscFunctionBegin;
795: /* allocate space for the transpose of the lower triangular factor */
796: PetscCall(PetscNew(&loTriFactorT));
797: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
799: /* set the matrix descriptors of the lower triangular factor */
800: matrixType = hipsparseGetMatType(loTriFactor->descr);
801: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
802: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
803: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
805: /* Create the matrix description */
806: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
807: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
808: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
809: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
810: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
812: /* set the operation */
813: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
815: /* allocate GPU space for the CSC of the lower triangular factor*/
816: loTriFactorT->csrMat = new CsrMatrix;
817: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
818: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
819: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
820: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
821: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
822: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
824: /* compute the transpose of the lower triangular factor, i.e. the CSC */
825: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
826: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
827: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
828: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
829: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
830: PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
831: #endif
832: */
833: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
835: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
836: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
837: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
838: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
839: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
840: #else
841: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
842: #endif
844: PetscCallHIP(WaitForHIP());
845: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
847: /* Create the solve analysis information */
848: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
849: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
850: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
851: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
852: PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
854: /* perform the solve analysis */
855: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
856: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
858: PetscCallHIP(WaitForHIP());
859: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
861: /* assign the pointer */
862: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
864: /*********************************************/
865: /* Now the Transpose of the Upper Tri Factor */
866: /*********************************************/
868: /* allocate space for the transpose of the upper triangular factor */
869: PetscCall(PetscNew(&upTriFactorT));
870: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
872: /* set the matrix descriptors of the upper triangular factor */
873: matrixType = hipsparseGetMatType(upTriFactor->descr);
874: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
875: fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
876: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
878: /* Create the matrix description */
879: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
880: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
881: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
882: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
883: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
885: /* set the operation */
886: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
888: /* allocate GPU space for the CSC of the upper triangular factor*/
889: upTriFactorT->csrMat = new CsrMatrix;
890: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
891: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
892: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
893: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
894: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
895: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
897: /* compute the transpose of the upper triangular factor, i.e. the CSC */
898: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
899: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
900: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
901: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
902: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
903: PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
904: #endif
905: */
906: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
907: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
908: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
909: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
910: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
911: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
912: #else
913: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
914: #endif
916: PetscCallHIP(WaitForHIP());
917: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
919: /* Create the solve analysis information */
920: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
921: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
922: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
923: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
924: PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
926: /* perform the solve analysis */
927: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
928: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
930: PetscCallHIP(WaitForHIP());
931: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
933: /* assign the pointer */
934: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
935: PetscFunctionReturn(PETSC_SUCCESS);
936: }
938: struct PetscScalarToPetscInt {
939: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
940: };
942: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
943: {
944: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
945: Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
946: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
947: hipsparseIndexBase_t indexBase;
949: PetscFunctionBegin;
950: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
951: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
952: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
953: matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
954: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
955: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
956: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
957: PetscCall(PetscLogGpuTimeBegin());
958: if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
959: if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
960: matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
961: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
962: indexBase = hipsparseGetMatIndexBase(matstruct->descr);
963: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
964: PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
966: /* set alpha and beta */
967: PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
968: PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
969: PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
970: PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
971: PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
972: PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
974: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
975: CsrMatrix *matrixT = new CsrMatrix;
976: matstructT->mat = matrixT;
977: matrixT->num_rows = A->cmap->n;
978: matrixT->num_cols = A->rmap->n;
979: matrixT->num_entries = a->nz;
980: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
981: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
982: matrixT->values = new THRUSTARRAY(a->nz);
984: if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
985: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
987: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
988: indexBase, hipsparse_scalartype));
989: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
990: CsrMatrix *temp = new CsrMatrix;
991: CsrMatrix *tempT = new CsrMatrix;
992: /* First convert HYB to CSR */
993: temp->num_rows = A->rmap->n;
994: temp->num_cols = A->cmap->n;
995: temp->num_entries = a->nz;
996: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
997: temp->column_indices = new THRUSTINTARRAY32(a->nz);
998: temp->values = new THRUSTARRAY(a->nz);
1000: PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));
1002: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1003: tempT->num_rows = A->rmap->n;
1004: tempT->num_cols = A->cmap->n;
1005: tempT->num_entries = a->nz;
1006: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1007: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1008: tempT->values = new THRUSTARRAY(a->nz);
1010: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1011: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1013: /* Last, convert CSC to HYB */
1014: hipsparseHybMat_t hybMat;
1015: PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1016: hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1017: PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));
1019: /* assign the pointer */
1020: matstructT->mat = hybMat;
1021: A->transupdated = PETSC_TRUE;
1022: /* delete temporaries */
1023: if (tempT) {
1024: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1025: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1026: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1027: delete (CsrMatrix *)tempT;
1028: }
1029: if (temp) {
1030: if (temp->values) delete (THRUSTARRAY *)temp->values;
1031: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1032: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1033: delete (CsrMatrix *)temp;
1034: }
1035: }
1036: }
1037: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1038: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1039: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1040: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1041: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1042: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1043: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1044: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1045: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1046: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1047: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1048: if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1049: hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1050: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1051: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1052: }
1053: if (!hipsparsestruct->csr2csc_i) {
1054: THRUSTARRAY csr2csc_a(matrix->num_entries);
1055: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1057: indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1058: if (matrix->num_entries) {
1059: /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
1060: Need to verify this for ROCm.
1061: */
1062: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1063: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1064: } else {
1065: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1066: }
1068: hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1069: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1070: }
1071: PetscCallThrust(
1072: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1073: }
1074: PetscCall(PetscLogGpuTimeEnd());
1075: PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1076: /* the compressed row indices is not used for matTranspose */
1077: matstructT->cprowIndices = NULL;
1078: /* assign the pointer */
1079: ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1080: A->transupdated = PETSC_TRUE;
1081: PetscFunctionReturn(PETSC_SUCCESS);
1082: }
1084: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1085: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1086: {
1087: PetscInt n = xx->map->n;
1088: const PetscScalar *barray;
1089: PetscScalar *xarray;
1090: thrust::device_ptr<const PetscScalar> bGPU;
1091: thrust::device_ptr<PetscScalar> xGPU;
1092: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1093: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1094: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1095: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1097: PetscFunctionBegin;
1098: /* Analyze the matrix and create the transpose ... on the fly */
1099: if (!loTriFactorT && !upTriFactorT) {
1100: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1101: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1102: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1103: }
1105: /* Get the GPU pointers */
1106: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1107: PetscCall(VecHIPGetArrayRead(bb, &barray));
1108: xGPU = thrust::device_pointer_cast(xarray);
1109: bGPU = thrust::device_pointer_cast(barray);
1111: PetscCall(PetscLogGpuTimeBegin());
1112: /* First, reorder with the row permutation */
1113: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);
1115: /* First, solve U */
1116: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1117: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1119: /* Then, solve L */
1120: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1121: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1123: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1124: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());
1126: /* Copy the temporary to the full solution. */
1127: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);
1129: /* restore */
1130: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1131: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1132: PetscCall(PetscLogGpuTimeEnd());
1133: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1134: PetscFunctionReturn(PETSC_SUCCESS);
1135: }
1137: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1138: {
1139: const PetscScalar *barray;
1140: PetscScalar *xarray;
1141: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1142: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1143: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1144: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1146: PetscFunctionBegin;
1147: /* Analyze the matrix and create the transpose ... on the fly */
1148: if (!loTriFactorT && !upTriFactorT) {
1149: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1150: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1151: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1152: }
1154: /* Get the GPU pointers */
1155: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1156: PetscCall(VecHIPGetArrayRead(bb, &barray));
1158: PetscCall(PetscLogGpuTimeBegin());
1159: /* First, solve U */
1160: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1161: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1163: /* Then, solve L */
1164: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1165: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1167: /* restore */
1168: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1169: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1170: PetscCall(PetscLogGpuTimeEnd());
1171: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1172: PetscFunctionReturn(PETSC_SUCCESS);
1173: }
1175: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1176: {
1177: const PetscScalar *barray;
1178: PetscScalar *xarray;
1179: thrust::device_ptr<const PetscScalar> bGPU;
1180: thrust::device_ptr<PetscScalar> xGPU;
1181: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1182: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1183: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1184: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1186: PetscFunctionBegin;
1187: /* Get the GPU pointers */
1188: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1189: PetscCall(VecHIPGetArrayRead(bb, &barray));
1190: xGPU = thrust::device_pointer_cast(xarray);
1191: bGPU = thrust::device_pointer_cast(barray);
1193: PetscCall(PetscLogGpuTimeBegin());
1194: /* First, reorder with the row permutation */
1195: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1197: /* Next, solve L */
1198: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1199: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1201: /* Then, solve U */
1202: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1203: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1205: /* Last, reorder with the column permutation */
1206: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1208: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1209: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1210: PetscCall(PetscLogGpuTimeEnd());
1211: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1212: PetscFunctionReturn(PETSC_SUCCESS);
1213: }
1215: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1216: {
1217: const PetscScalar *barray;
1218: PetscScalar *xarray;
1219: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1220: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1221: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1222: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1224: PetscFunctionBegin;
1225: /* Get the GPU pointers */
1226: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1227: PetscCall(VecHIPGetArrayRead(bb, &barray));
1229: PetscCall(PetscLogGpuTimeBegin());
1230: /* First, solve L */
1231: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1232: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1234: /* Next, solve U */
1235: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1236: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1238: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1239: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1240: PetscCall(PetscLogGpuTimeEnd());
1241: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1242: PetscFunctionReturn(PETSC_SUCCESS);
1243: }
1245: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1246: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1247: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1248: {
1249: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1250: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1251: const PetscScalar *barray;
1252: PetscScalar *xarray;
1254: PetscFunctionBegin;
1255: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1256: PetscCall(VecHIPGetArrayRead(b, &barray));
1257: PetscCall(PetscLogGpuTimeBegin());
1259: /* Solve L*y = b */
1260: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1261: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1262: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1263: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1264: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1265: #else
1266: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1267: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1268: #endif
1269: /* Solve U*x = y */
1270: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1271: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1272: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1273: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1274: #else
1275: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1276: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1277: #endif
1278: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1279: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1281: PetscCall(PetscLogGpuTimeEnd());
1282: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1283: PetscFunctionReturn(PETSC_SUCCESS);
1284: }
1286: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1287: {
1288: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1289: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1290: const PetscScalar *barray;
1291: PetscScalar *xarray;
1293: PetscFunctionBegin;
1294: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1295: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1296: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1297: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1299: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1300: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1301: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1302: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1303: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1304: }
1306: if (!fs->updatedTransposeSpSVAnalysis) {
1307: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1309: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1310: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1311: }
1313: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1314: PetscCall(VecHIPGetArrayRead(b, &barray));
1315: PetscCall(PetscLogGpuTimeBegin());
1317: /* Solve Ut*y = b */
1318: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1319: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1320: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1321: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1322: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1323: #else
1324: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1325: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1326: #endif
1327: /* Solve Lt*x = y */
1328: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1329: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1330: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1331: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1332: #else
1333: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1334: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1335: #endif
1336: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1337: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1338: PetscCall(PetscLogGpuTimeEnd());
1339: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1340: PetscFunctionReturn(PETSC_SUCCESS);
1341: }
1343: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1344: {
1345: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1346: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1347: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1348: CsrMatrix *Acsr;
1349: PetscInt m, nz;
1350: PetscBool flg;
1352: PetscFunctionBegin;
1353: if (PetscDefined(USE_DEBUG)) {
1354: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1355: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1356: }
1358: /* Copy A's value to fact */
1359: m = fact->rmap->n;
1360: nz = aij->nz;
1361: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1362: Acsr = (CsrMatrix *)Acusp->mat->mat;
1363: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1365: /* Factorize fact inplace */
1366: if (m)
1367: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1368: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1369: if (PetscDefined(USE_DEBUG)) {
1370: int numerical_zero;
1371: hipsparseStatus_t status;
1372: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1373: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1374: }
1376: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1377: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1379: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1381: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1382: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1384: fact->offloadmask = PETSC_OFFLOAD_GPU;
1385: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
1386: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1387: fact->ops->matsolve = NULL;
1388: fact->ops->matsolvetranspose = NULL;
1389: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1390: PetscFunctionReturn(PETSC_SUCCESS);
1391: }
1393: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1394: {
1395: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1396: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1397: PetscInt m, nz;
1399: PetscFunctionBegin;
1400: if (PetscDefined(USE_DEBUG)) {
1401: PetscBool flg, diagDense;
1403: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1407: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1408: }
1410: /* Free the old stale stuff */
1411: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1413: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414: but they will not be used. Allocate them just for easy debugging.
1415: */
1416: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1418: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1419: fact->factortype = MAT_FACTOR_ILU;
1420: fact->info.factor_mallocs = 0;
1421: fact->info.fill_ratio_given = info->fill;
1422: fact->info.fill_ratio_needed = 1.0;
1424: aij->row = NULL;
1425: aij->col = NULL;
1427: /* ====================================================================== */
1428: /* Copy A's i, j to fact and also allocate the value array of fact. */
1429: /* We'll do in-place factorization on fact */
1430: /* ====================================================================== */
1431: const int *Ai, *Aj;
1433: m = fact->rmap->n;
1434: nz = aij->nz;
1436: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1443: /* ====================================================================== */
1444: /* Create descriptors for M, L, U */
1445: /* ====================================================================== */
1446: hipsparseFillMode_t fillMode;
1447: hipsparseDiagType_t diagType;
1449: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1453: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458: */
1459: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1465: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1471: /* ========================================================================= */
1472: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1473: /* ========================================================================= */
1474: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475: if (m)
1476: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1479: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1482: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1485: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1488: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1491: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493: */
1494: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496: fs->spsvBuffer_L = fs->factBuffer_M;
1497: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498: } else {
1499: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500: fs->spsvBuffer_U = fs->factBuffer_M;
1501: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502: }
1504: /* ========================================================================== */
1505: /* Perform analysis of ilu0 on M, SpSv on L and U */
1506: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507: /* ========================================================================== */
1508: int structural_zero;
1510: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511: if (m)
1512: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514: if (PetscDefined(USE_DEBUG)) {
1515: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516: hipsparseStatus_t status;
1517: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519: }
1521: /* Estimate FLOPs of the numeric factorization */
1522: {
1523: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1524: PetscInt *Ai, nzRow, nzLeft;
1525: PetscLogDouble flops = 0.0;
1526: const PetscInt *Adiag;
1528: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
1529: Ai = Aseq->i;
1530: for (PetscInt i = 0; i < m; i++) {
1531: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532: nzRow = Ai[i + 1] - Ai[i];
1533: nzLeft = Adiag[i] - Ai[i];
1534: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536: */
1537: nzLeft = (nzRow - 1) / 2;
1538: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539: }
1540: }
1541: fs->numericFactFlops = flops;
1542: }
1543: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544: PetscFunctionReturn(PETSC_SUCCESS);
1545: }
1547: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1548: {
1549: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1550: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1551: const PetscScalar *barray;
1552: PetscScalar *xarray;
1554: PetscFunctionBegin;
1555: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1556: PetscCall(VecHIPGetArrayRead(b, &barray));
1557: PetscCall(PetscLogGpuTimeBegin());
1559: /* Solve L*y = b */
1560: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1561: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1562: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1563: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1564: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1565: #else
1566: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1567: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1568: #endif
1569: /* Solve Lt*x = y */
1570: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1571: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1572: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1573: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1574: #else
1575: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1576: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1577: #endif
1578: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1579: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1581: PetscCall(PetscLogGpuTimeEnd());
1582: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1583: PetscFunctionReturn(PETSC_SUCCESS);
1584: }
1586: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1587: {
1588: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1589: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1590: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1591: CsrMatrix *Acsr;
1592: PetscInt m, nz;
1593: PetscBool flg;
1595: PetscFunctionBegin;
1596: if (PetscDefined(USE_DEBUG)) {
1597: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1598: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1599: }
1601: /* Copy A's value to fact */
1602: m = fact->rmap->n;
1603: nz = aij->nz;
1604: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1605: Acsr = (CsrMatrix *)Acusp->mat->mat;
1606: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1608: /* Factorize fact inplace */
1609: /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1610: The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1611: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1612: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1613: */
1614: if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1615: if (PetscDefined(USE_DEBUG)) {
1616: int numerical_zero;
1617: hipsparseStatus_t status;
1618: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1619: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1620: }
1622: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1624: /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1625: ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1626: */
1627: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1629: fact->offloadmask = PETSC_OFFLOAD_GPU;
1630: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
1631: fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0;
1632: fact->ops->matsolve = NULL;
1633: fact->ops->matsolvetranspose = NULL;
1634: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1635: PetscFunctionReturn(PETSC_SUCCESS);
1636: }
1638: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1639: {
1640: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1641: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1642: PetscInt m, nz;
1644: PetscFunctionBegin;
1645: if (PetscDefined(USE_DEBUG)) {
1646: PetscBool flg, diagDense;
1648: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1649: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1650: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1651: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1652: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1653: }
1655: /* Free the old stale stuff */
1656: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1658: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1659: but they will not be used. Allocate them just for easy debugging.
1660: */
1661: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1663: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1664: fact->factortype = MAT_FACTOR_ICC;
1665: fact->info.factor_mallocs = 0;
1666: fact->info.fill_ratio_given = info->fill;
1667: fact->info.fill_ratio_needed = 1.0;
1669: aij->row = NULL;
1670: aij->col = NULL;
1672: /* ====================================================================== */
1673: /* Copy A's i, j to fact and also allocate the value array of fact. */
1674: /* We'll do in-place factorization on fact */
1675: /* ====================================================================== */
1676: const int *Ai, *Aj;
1678: m = fact->rmap->n;
1679: nz = aij->nz;
1681: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1682: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1683: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1684: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1685: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1686: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1688: /* ====================================================================== */
1689: /* Create mat descriptors for M, L */
1690: /* ====================================================================== */
1691: hipsparseFillMode_t fillMode;
1692: hipsparseDiagType_t diagType;
1694: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1695: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1696: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1698: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1699: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1700: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1701: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1702: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1703: */
1704: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1705: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1706: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1707: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1708: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1710: /* ========================================================================= */
1711: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1712: /* ========================================================================= */
1713: PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1714: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1716: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1717: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1719: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1720: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1722: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1723: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1725: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1726: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1728: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1729: See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1730: */
1731: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1732: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1733: fs->spsvBuffer_L = fs->factBuffer_M;
1734: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1735: } else {
1736: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1737: fs->spsvBuffer_Lt = fs->factBuffer_M;
1738: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1739: }
1741: /* ========================================================================== */
1742: /* Perform analysis of ic0 on M */
1743: /* The lower triangular part of M has the same sparsity pattern as L */
1744: /* ========================================================================== */
1745: int structural_zero;
1747: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1748: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1749: if (PetscDefined(USE_DEBUG)) {
1750: hipsparseStatus_t status;
1751: /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1752: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1753: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1754: }
1756: /* Estimate FLOPs of the numeric factorization */
1757: {
1758: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1759: PetscInt *Ai, nzRow, nzLeft;
1760: PetscLogDouble flops = 0.0;
1762: Ai = Aseq->i;
1763: for (PetscInt i = 0; i < m; i++) {
1764: nzRow = Ai[i + 1] - Ai[i];
1765: if (nzRow > 1) {
1766: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1767: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1768: */
1769: nzLeft = (nzRow - 1) / 2;
1770: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1771: }
1772: }
1773: fs->numericFactFlops = flops;
1774: }
1775: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1776: PetscFunctionReturn(PETSC_SUCCESS);
1777: }
1778: #endif
1780: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1781: {
1782: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1784: PetscFunctionBegin;
1785: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1786: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1787: if (!info->factoronhost) {
1788: PetscCall(ISIdentity(isrow, &row_identity));
1789: PetscCall(ISIdentity(iscol, &col_identity));
1790: }
1791: if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1792: else
1793: #endif
1794: {
1795: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1796: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1797: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1798: }
1799: PetscFunctionReturn(PETSC_SUCCESS);
1800: }
1802: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1803: {
1804: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1806: PetscFunctionBegin;
1807: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1808: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1809: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1810: PetscFunctionReturn(PETSC_SUCCESS);
1811: }
1813: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1814: {
1815: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1817: PetscFunctionBegin;
1818: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1819: PetscBool perm_identity = PETSC_FALSE;
1820: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1821: if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1822: else
1823: #endif
1824: {
1825: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1826: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1827: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1828: }
1829: PetscFunctionReturn(PETSC_SUCCESS);
1830: }
1832: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1833: {
1834: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1836: PetscFunctionBegin;
1837: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1838: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1839: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1840: PetscFunctionReturn(PETSC_SUCCESS);
1841: }
1843: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1844: {
1845: PetscFunctionBegin;
1846: *type = MATSOLVERHIPSPARSE;
1847: PetscFunctionReturn(PETSC_SUCCESS);
1848: }
1850: /*MC
  MATSOLVERHIPSPARSE = "hipsparse" - A matrix solver type providing triangular solvers for sequential matrices
  of type `MATSEQAIJHIPSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  hipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
1858: Level: beginner
1860: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1861: M*/
1863: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1864: {
1865: PetscInt n = A->rmap->n;
1867: PetscFunctionBegin;
1868: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1869: PetscCall(MatSetSizes(*B, n, n, n, n));
1870: (*B)->factortype = ftype;
1871: PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1873: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1874: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1875: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1876: if (!A->boundtocpu) {
1877: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1878: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1879: } else {
1880: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1881: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1882: }
1883: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1884: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1885: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1886: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1887: if (!A->boundtocpu) {
1888: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1889: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1890: } else {
1891: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1892: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1893: }
1894: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1895: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1896: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1898: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1899: (*B)->canuseordering = PETSC_TRUE;
1900: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1901: PetscFunctionReturn(PETSC_SUCCESS);
1902: }
1904: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1905: {
1906: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1907: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1908: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1909: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1910: #endif
1912: PetscFunctionBegin;
1913: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1914: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1915: if (A->factortype == MAT_FACTOR_NONE) {
1916: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1917: PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1918: }
1919: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1920: else if (fs->csrVal) {
1921: /* We have a factorized matrix on device and are able to copy it to host */
1922: PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1923: }
1924: #endif
1925: else
1926: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1927: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1928: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1929: A->offloadmask = PETSC_OFFLOAD_BOTH;
1930: }
1931: PetscFunctionReturn(PETSC_SUCCESS);
1932: }
1934: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1935: {
1936: PetscFunctionBegin;
1937: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1938: *array = ((Mat_SeqAIJ *)A->data)->a;
1939: PetscFunctionReturn(PETSC_SUCCESS);
1940: }
1942: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1943: {
1944: PetscFunctionBegin;
1945: A->offloadmask = PETSC_OFFLOAD_CPU;
1946: *array = NULL;
1947: PetscFunctionReturn(PETSC_SUCCESS);
1948: }
1950: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1951: {
1952: PetscFunctionBegin;
1953: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1954: *array = ((Mat_SeqAIJ *)A->data)->a;
1955: PetscFunctionReturn(PETSC_SUCCESS);
1956: }
1958: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1959: {
1960: PetscFunctionBegin;
1961: *array = NULL;
1962: PetscFunctionReturn(PETSC_SUCCESS);
1963: }
1965: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1966: {
1967: PetscFunctionBegin;
1968: *array = ((Mat_SeqAIJ *)A->data)->a;
1969: PetscFunctionReturn(PETSC_SUCCESS);
1970: }
1972: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1973: {
1974: PetscFunctionBegin;
1975: A->offloadmask = PETSC_OFFLOAD_CPU;
1976: *array = NULL;
1977: PetscFunctionReturn(PETSC_SUCCESS);
1978: }
1980: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1981: {
1982: Mat_SeqAIJHIPSPARSE *cusp;
1983: CsrMatrix *matrix;
1985: PetscFunctionBegin;
1986: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1987: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1988: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1989: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1990: matrix = (CsrMatrix *)cusp->mat->mat;
1992: if (i) {
1993: #if !defined(PETSC_USE_64BIT_INDICES)
1994: *i = matrix->row_offsets->data().get();
1995: #else
1996: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1997: #endif
1998: }
1999: if (j) {
2000: #if !defined(PETSC_USE_64BIT_INDICES)
2001: *j = matrix->column_indices->data().get();
2002: #else
2003: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2004: #endif
2005: }
2006: if (a) *a = matrix->values->data().get();
2007: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2008: PetscFunctionReturn(PETSC_SUCCESS);
2009: }
2011: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2012: {
2013: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2014: Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
2015: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2016: PetscBool both = PETSC_TRUE;
2017: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2019: PetscFunctionBegin;
2020: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2021: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2022: if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2023: CsrMatrix *matrix;
2024: matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
2026: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2027: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2028: matrix->values->assign(a->a, a->a + a->nz);
2029: PetscCallHIP(WaitForHIP());
2030: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2031: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2032: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2033: } else {
2034: PetscInt nnz;
2035: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2036: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2037: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2038: delete hipsparsestruct->workVector;
2039: delete hipsparsestruct->rowoffsets_gpu;
2040: hipsparsestruct->workVector = NULL;
2041: hipsparsestruct->rowoffsets_gpu = NULL;
2042: try {
2043: if (a->compressedrow.use) {
2044: m = a->compressedrow.nrows;
2045: ii = a->compressedrow.i;
2046: ridx = a->compressedrow.rindex;
2047: } else {
2048: m = A->rmap->n;
2049: ii = a->i;
2050: ridx = NULL;
2051: }
2052: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2053: if (!a->a) {
2054: nnz = ii[m];
2055: both = PETSC_FALSE;
2056: } else nnz = a->nz;
2057: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2059: /* create hipsparse matrix */
2060: hipsparsestruct->nrows = m;
2061: matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
2062: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2063: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2064: PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2066: PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2067: PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2068: PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2069: PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2070: PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2071: PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2072: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2074: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2075: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2076: /* set the matrix */
2077: CsrMatrix *mat = new CsrMatrix;
2078: mat->num_rows = m;
2079: mat->num_cols = A->cmap->n;
2080: mat->num_entries = nnz;
2081: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2082: mat->column_indices = new THRUSTINTARRAY32(nnz);
2083: mat->values = new THRUSTARRAY(nnz);
2084: mat->row_offsets->assign(ii, ii + m + 1);
2085: mat->column_indices->assign(a->j, a->j + nnz);
2086: if (a->a) mat->values->assign(a->a, a->a + nnz);
2088: /* assign the pointer */
2089: matstruct->mat = mat;
2090: if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2091: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2092: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2093: }
2094: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2095: CsrMatrix *mat = new CsrMatrix;
2096: mat->num_rows = m;
2097: mat->num_cols = A->cmap->n;
2098: mat->num_entries = nnz;
2099: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2100: mat->column_indices = new THRUSTINTARRAY32(nnz);
2101: mat->values = new THRUSTARRAY(nnz);
2102: mat->row_offsets->assign(ii, ii + m + 1);
2103: mat->column_indices->assign(a->j, a->j + nnz);
2104: if (a->a) mat->values->assign(a->a, a->a + nnz);
2106: hipsparseHybMat_t hybMat;
2107: PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2108: hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2109: PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2110: /* assign the pointer */
2111: matstruct->mat = hybMat;
2113: if (mat) {
2114: if (mat->values) delete (THRUSTARRAY *)mat->values;
2115: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2116: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2117: delete (CsrMatrix *)mat;
2118: }
2119: }
2121: /* assign the compressed row indices */
2122: if (a->compressedrow.use) {
2123: hipsparsestruct->workVector = new THRUSTARRAY(m);
2124: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2125: matstruct->cprowIndices->assign(ridx, ridx + m);
2126: tmp = m;
2127: } else {
2128: hipsparsestruct->workVector = NULL;
2129: matstruct->cprowIndices = NULL;
2130: tmp = 0;
2131: }
2132: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2134: /* assign the pointer */
2135: hipsparsestruct->mat = matstruct;
2136: } catch (char *ex) {
2137: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2138: }
2139: PetscCallHIP(WaitForHIP());
2140: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2141: hipsparsestruct->nonzerostate = A->nonzerostate;
2142: }
2143: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2144: }
2145: PetscFunctionReturn(PETSC_SUCCESS);
2146: }
2148: struct VecHIPPlusEquals {
2149: template <typename Tuple>
2150: __host__ __device__ void operator()(Tuple t)
2151: {
2152: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2153: }
2154: };
2156: struct VecHIPEquals {
2157: template <typename Tuple>
2158: __host__ __device__ void operator()(Tuple t)
2159: {
2160: thrust::get<1>(t) = thrust::get<0>(t);
2161: }
2162: };
2164: struct VecHIPEqualsReverse {
2165: template <typename Tuple>
2166: __host__ __device__ void operator()(Tuple t)
2167: {
2168: thrust::get<0>(t) = thrust::get<1>(t);
2169: }
2170: };
2172: struct MatProductCtx_MatMatHipsparse {
2173: PetscBool cisdense;
2174: PetscScalar *Bt;
2175: Mat X;
2176: PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2177: PetscLogDouble flops;
2178: CsrMatrix *Bcsr;
2179: hipsparseSpMatDescr_t matSpBDescr;
2180: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2181: hipsparseDnMatDescr_t matBDescr;
2182: hipsparseDnMatDescr_t matCDescr;
2183: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2184: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2185: void *dBuffer4, *dBuffer5;
2186: #endif
2187: size_t mmBufferSize;
2188: void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2189: hipsparseSpGEMMDescr_t spgemmDesc;
2190: };
2192: static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2193: {
2194: MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
2196: PetscFunctionBegin;
2197: PetscCallHIP(hipFree(mmdata->Bt));
2198: delete mmdata->Bcsr;
2199: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2200: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2201: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2202: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2203: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2204: if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2205: if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2206: #endif
2207: if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2208: if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2209: PetscCall(MatDestroy(&mmdata->X));
2210: PetscCall(PetscFree(*(void **)data));
2211: PetscFunctionReturn(PETSC_SUCCESS);
2212: }
2214: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2215: {
2216: Mat_Product *product = C->product;
2217: Mat A, B;
2218: PetscInt m, n, blda, clda;
2219: PetscBool flg, biship;
2220: Mat_SeqAIJHIPSPARSE *cusp;
2221: hipsparseOperation_t opA;
2222: const PetscScalar *barray;
2223: PetscScalar *carray;
2224: MatProductCtx_MatMatHipsparse *mmdata;
2225: Mat_SeqAIJHIPSPARSEMultStruct *mat;
2226: CsrMatrix *csrmat;
2228: PetscFunctionBegin;
2229: MatCheckProduct(C, 1);
2230: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2231: mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
2232: A = product->A;
2233: B = product->B;
2234: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2235: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2236: /* currently CopyToGpu does not copy if the matrix is bound to CPU
2237: Instead of silently accepting the wrong answer, I prefer to raise the error */
2238: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2239: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2240: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2241: switch (product->type) {
2242: case MATPRODUCT_AB:
2243: case MATPRODUCT_PtAP:
2244: mat = cusp->mat;
2245: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2246: m = A->rmap->n;
2247: n = B->cmap->n;
2248: break;
2249: case MATPRODUCT_AtB:
2250: if (!A->form_explicit_transpose) {
2251: mat = cusp->mat;
2252: opA = HIPSPARSE_OPERATION_TRANSPOSE;
2253: } else {
2254: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2255: mat = cusp->matTranspose;
2256: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2257: }
2258: m = A->cmap->n;
2259: n = B->cmap->n;
2260: break;
2261: case MATPRODUCT_ABt:
2262: case MATPRODUCT_RARt:
2263: mat = cusp->mat;
2264: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2265: m = A->rmap->n;
2266: n = B->rmap->n;
2267: break;
2268: default:
2269: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2270: }
2271: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2272: csrmat = (CsrMatrix *)mat->mat;
2273: /* if the user passed a CPU matrix, copy the data to the GPU */
2274: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2275: if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
2276: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2277: PetscCall(MatDenseGetLDA(B, &blda));
2278: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2279: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2280: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2281: } else {
2282: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2283: PetscCall(MatDenseGetLDA(C, &clda));
2284: }
2286: PetscCall(PetscLogGpuTimeBegin());
2287: hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2288: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2289: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2290: size_t mmBufferSize;
2291: if (mmdata->initialized && mmdata->Blda != blda) {
2292: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2293: mmdata->matBDescr = NULL;
2294: }
2295: if (!mmdata->matBDescr) {
2296: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2297: mmdata->Blda = blda;
2298: }
2299: if (mmdata->initialized && mmdata->Clda != clda) {
2300: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2301: mmdata->matCDescr = NULL;
2302: }
2303: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2304: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2305: mmdata->Clda = clda;
2306: }
2307: if (!mat->matDescr) {
2308: PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2309: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2310: }
2311: PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2312: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2313: PetscCallHIP(hipFree(mmdata->mmBuffer));
2314: PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2315: mmdata->mmBufferSize = mmBufferSize;
2316: }
2317: mmdata->initialized = PETSC_TRUE;
2318: } else {
2319: /* to be safe, always update pointers of the mats */
2320: PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2321: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2322: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2323: }
2325: /* do hipsparseSpMM, which supports transpose on B */
2326: PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2328: PetscCall(PetscLogGpuTimeEnd());
2329: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2330: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2331: if (product->type == MATPRODUCT_RARt) {
2332: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2333: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2334: } else if (product->type == MATPRODUCT_PtAP) {
2335: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2336: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2337: } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2338: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2339: if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2340: PetscFunctionReturn(PETSC_SUCCESS);
2341: }
2343: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2344: {
2345: Mat_Product *product = C->product;
2346: Mat A, B;
2347: PetscInt m, n;
2348: PetscBool cisdense, flg;
2349: MatProductCtx_MatMatHipsparse *mmdata;
2350: Mat_SeqAIJHIPSPARSE *cusp;
2352: PetscFunctionBegin;
2353: MatCheckProduct(C, 1);
2354: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2355: A = product->A;
2356: B = product->B;
2357: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2358: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2359: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2360: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2361: switch (product->type) {
2362: case MATPRODUCT_AB:
2363: m = A->rmap->n;
2364: n = B->cmap->n;
2365: break;
2366: case MATPRODUCT_AtB:
2367: m = A->cmap->n;
2368: n = B->cmap->n;
2369: break;
2370: case MATPRODUCT_ABt:
2371: m = A->rmap->n;
2372: n = B->rmap->n;
2373: break;
2374: case MATPRODUCT_PtAP:
2375: m = B->cmap->n;
2376: n = B->cmap->n;
2377: break;
2378: case MATPRODUCT_RARt:
2379: m = B->rmap->n;
2380: n = B->rmap->n;
2381: break;
2382: default:
2383: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2384: }
2385: PetscCall(MatSetSizes(C, m, n, m, n));
2386: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2387: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2388: PetscCall(MatSetType(C, MATSEQDENSEHIP));
2390: /* product data */
2391: PetscCall(PetscNew(&mmdata));
2392: mmdata->cisdense = cisdense;
2393: /* for these products we need intermediate storage */
2394: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2395: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2396: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2397: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2398: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2399: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2400: }
2401: C->product->data = mmdata;
2402: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2403: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2404: PetscFunctionReturn(PETSC_SUCCESS);
2405: }
2407: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2408: {
2409: Mat_Product *product = C->product;
2410: Mat A, B;
2411: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2412: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2413: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2414: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2415: PetscBool flg;
2416: MatProductType ptype;
2417: MatProductCtx_MatMatHipsparse *mmdata;
2418: hipsparseSpMatDescr_t BmatSpDescr;
2419: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2421: PetscFunctionBegin;
2422: MatCheckProduct(C, 1);
2423: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2424: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2425: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2426: mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
2427: A = product->A;
2428: B = product->B;
2429: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2430: mmdata->reusesym = PETSC_FALSE;
2431: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2432: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2433: Cmat = Ccusp->mat;
2434: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2435: Ccsr = (CsrMatrix *)Cmat->mat;
2436: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2437: goto finalize;
2438: }
2439: if (!c->nz) goto finalize;
2440: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2441: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2442: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2443: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2444: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2445: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2446: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2447: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2448: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2449: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2450: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2451: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2452: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2453: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2455: ptype = product->type;
2456: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2457: ptype = MATPRODUCT_AB;
2458: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2459: }
2460: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2461: ptype = MATPRODUCT_AB;
2462: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2463: }
2464: switch (ptype) {
2465: case MATPRODUCT_AB:
2466: Amat = Acusp->mat;
2467: Bmat = Bcusp->mat;
2468: break;
2469: case MATPRODUCT_AtB:
2470: Amat = Acusp->matTranspose;
2471: Bmat = Bcusp->mat;
2472: break;
2473: case MATPRODUCT_ABt:
2474: Amat = Acusp->mat;
2475: Bmat = Bcusp->matTranspose;
2476: break;
2477: default:
2478: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2479: }
2480: Cmat = Ccusp->mat;
2481: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2482: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2483: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2484: Acsr = (CsrMatrix *)Amat->mat;
2485: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2486: Ccsr = (CsrMatrix *)Cmat->mat;
2487: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2488: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2489: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2490: PetscCall(PetscLogGpuTimeBegin());
2491: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2492: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2493: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2494: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2495: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2496: #else
2497: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2498: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2499: #endif
2500: #else
2501: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2502: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2503: Ccsr->column_indices->data().get()));
2504: #endif
2505: PetscCall(PetscLogGpuFlops(mmdata->flops));
2506: PetscCallHIP(WaitForHIP());
2507: PetscCall(PetscLogGpuTimeEnd());
2508: C->offloadmask = PETSC_OFFLOAD_GPU;
2509: finalize:
2510: /* shorter version of MatAssemblyEnd_SeqAIJ */
2511: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2512: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2513: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2514: c->reallocs = 0;
2515: C->info.mallocs += 0;
2516: C->info.nz_unneeded = 0;
2517: C->assembled = C->was_assembled = PETSC_TRUE;
2518: C->num_ass++;
2519: PetscFunctionReturn(PETSC_SUCCESS);
2520: }
2522: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2523: {
2524: Mat_Product *product = C->product;
2525: Mat A, B;
2526: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2527: Mat_SeqAIJ *a, *b, *c;
2528: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2529: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2530: PetscInt i, j, m, n, k;
2531: PetscBool flg;
2532: MatProductType ptype;
2533: MatProductCtx_MatMatHipsparse *mmdata;
2534: PetscLogDouble flops;
2535: PetscBool biscompressed, ciscompressed;
2536: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2537: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2538: hipsparseSpMatDescr_t BmatSpDescr;
2539: #else
2540: int cnz;
2541: #endif
2542: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2544: PetscFunctionBegin;
2545: MatCheckProduct(C, 1);
2546: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2547: A = product->A;
2548: B = product->B;
2549: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2550: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2551: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2552: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2553: a = (Mat_SeqAIJ *)A->data;
2554: b = (Mat_SeqAIJ *)B->data;
2555: /* product data */
2556: PetscCall(PetscNew(&mmdata));
2557: C->product->data = mmdata;
2558: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2560: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2561: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2562: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2563: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2564: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2565: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2567: ptype = product->type;
2568: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2569: ptype = MATPRODUCT_AB;
2570: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2571: }
2572: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2573: ptype = MATPRODUCT_AB;
2574: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2575: }
2576: biscompressed = PETSC_FALSE;
2577: ciscompressed = PETSC_FALSE;
2578: switch (ptype) {
2579: case MATPRODUCT_AB:
2580: m = A->rmap->n;
2581: n = B->cmap->n;
2582: k = A->cmap->n;
2583: Amat = Acusp->mat;
2584: Bmat = Bcusp->mat;
2585: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2586: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2587: break;
2588: case MATPRODUCT_AtB:
2589: m = A->cmap->n;
2590: n = B->cmap->n;
2591: k = A->rmap->n;
2592: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2593: Amat = Acusp->matTranspose;
2594: Bmat = Bcusp->mat;
2595: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2596: break;
2597: case MATPRODUCT_ABt:
2598: m = A->rmap->n;
2599: n = B->rmap->n;
2600: k = A->cmap->n;
2601: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2602: Amat = Acusp->mat;
2603: Bmat = Bcusp->matTranspose;
2604: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2605: break;
2606: default:
2607: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2608: }
2610: /* create hipsparse matrix */
2611: PetscCall(MatSetSizes(C, m, n, m, n));
2612: PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2613: c = (Mat_SeqAIJ *)C->data;
2614: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2615: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2616: Ccsr = new CsrMatrix;
2618: c->compressedrow.use = ciscompressed;
2619: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2620: c->compressedrow.nrows = a->compressedrow.nrows;
2621: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2622: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2623: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2624: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2625: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2626: } else {
2627: c->compressedrow.nrows = 0;
2628: c->compressedrow.i = NULL;
2629: c->compressedrow.rindex = NULL;
2630: Ccusp->workVector = NULL;
2631: Cmat->cprowIndices = NULL;
2632: }
2633: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2634: Ccusp->mat = Cmat;
2635: Ccusp->mat->mat = Ccsr;
2636: Ccsr->num_rows = Ccusp->nrows;
2637: Ccsr->num_cols = n;
2638: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2639: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2640: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2641: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2642: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2643: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2644: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2645: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2646: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2647: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2648: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2649: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2650: c->nz = 0;
2651: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2652: Ccsr->values = new THRUSTARRAY(c->nz);
2653: goto finalizesym;
2654: }
2656: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2657: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2658: Acsr = (CsrMatrix *)Amat->mat;
2659: if (!biscompressed) {
2660: Bcsr = (CsrMatrix *)Bmat->mat;
2661: BmatSpDescr = Bmat->matDescr;
2662: } else { /* we need to use row offsets for the full matrix */
2663: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2664: Bcsr = new CsrMatrix;
2665: Bcsr->num_rows = B->rmap->n;
2666: Bcsr->num_cols = cBcsr->num_cols;
2667: Bcsr->num_entries = cBcsr->num_entries;
2668: Bcsr->column_indices = cBcsr->column_indices;
2669: Bcsr->values = cBcsr->values;
2670: if (!Bcusp->rowoffsets_gpu) {
2671: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2672: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2673: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2674: }
2675: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2676: mmdata->Bcsr = Bcsr;
2677: if (Bcsr->num_rows && Bcsr->num_cols) {
2678: PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2679: }
2680: BmatSpDescr = mmdata->matSpBDescr;
2681: }
2682: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2683: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2684: /* precompute flops count */
2685: if (ptype == MATPRODUCT_AB) {
2686: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2687: const PetscInt st = a->i[i];
2688: const PetscInt en = a->i[i + 1];
2689: for (j = st; j < en; j++) {
2690: const PetscInt brow = a->j[j];
2691: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2692: }
2693: }
2694: } else if (ptype == MATPRODUCT_AtB) {
2695: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2696: const PetscInt anzi = a->i[i + 1] - a->i[i];
2697: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2698: flops += (2. * anzi) * bnzi;
2699: }
2700: } else flops = 0.; /* TODO */
2702: mmdata->flops = flops;
2703: PetscCall(PetscLogGpuTimeBegin());
2704: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2705: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2706: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2707: PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2708: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2709: {
2710: /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2711: We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2712: */
2713: void *dBuffer1 = NULL;
2714: void *dBuffer2 = NULL;
2715: void *dBuffer3 = NULL;
2716: /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2717: size_t bufferSize1 = 0;
2718: size_t bufferSize2 = 0;
2719: size_t bufferSize3 = 0;
2720: size_t bufferSize4 = 0;
2721: size_t bufferSize5 = 0;
2723: /* ask bufferSize1 bytes for external memory */
2724: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2725: PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2726: /* inspect the matrices A and B to understand the memory requirement for the next step */
2727: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2729: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2730: PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2731: PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2732: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2733: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2734: PetscCallHIP(hipFree(dBuffer1));
2735: PetscCallHIP(hipFree(dBuffer2));
2737: /* get matrix C non-zero entries C_nnz1 */
2738: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2739: c->nz = (PetscInt)C_nnz1;
2740: /* allocate matrix C */
2741: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2742: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2743: Ccsr->values = new THRUSTARRAY(c->nz);
2744: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2745: /* update matC with the new pointers */
2746: if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2747: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2749: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2750: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2751: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2752: PetscCallHIP(hipFree(dBuffer3));
2753: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2754: }
2755: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2756: }
2757: #else
2758: size_t bufSize2;
2759: /* ask bufferSize bytes for external memory */
2760: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2761: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2762: /* inspect the matrices A and B to understand the memory requirement for the next step */
2763: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2764: /* ask bufferSize again bytes for external memory */
2765: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2766: /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2767: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2768: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2769: is stored in the descriptor! What a messy API... */
2770: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2771: /* compute the intermediate product of A * B */
2772: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2773: /* get matrix C non-zero entries C_nnz1 */
2774: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2775: c->nz = (PetscInt)C_nnz1;
2776: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2777: mmdata->mmBufferSize / 1024));
2778: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2779: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2780: Ccsr->values = new THRUSTARRAY(c->nz);
2781: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2782: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2783: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2784: #endif
2785: #else
2786: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2787: PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2788: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2789: c->nz = cnz;
2790: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2791: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2792: Ccsr->values = new THRUSTARRAY(c->nz);
2793: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2795: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2796: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2797: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2798: D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2799: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2800: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2801: Ccsr->column_indices->data().get()));
2802: #endif
2803: PetscCall(PetscLogGpuFlops(mmdata->flops));
2804: PetscCall(PetscLogGpuTimeEnd());
2805: finalizesym:
2806: c->free_a = PETSC_TRUE;
2807: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2808: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2809: c->free_ij = PETSC_TRUE;
2810: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2811: PetscInt *d_i = c->i;
2812: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2813: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2814: ii = *Ccsr->row_offsets;
2815: jj = *Ccsr->column_indices;
2816: if (ciscompressed) d_i = c->compressedrow.i;
2817: PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2818: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2819: } else {
2820: PetscInt *d_i = c->i;
2821: if (ciscompressed) d_i = c->compressedrow.i;
2822: PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2823: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824: }
2825: if (ciscompressed) { /* need to expand host row offsets */
2826: PetscInt r = 0;
2827: c->i[0] = 0;
2828: for (k = 0; k < c->compressedrow.nrows; k++) {
2829: const PetscInt next = c->compressedrow.rindex[k];
2830: const PetscInt old = c->compressedrow.i[k];
2831: for (; r < next; r++) c->i[r + 1] = old;
2832: }
2833: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2834: }
2835: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2836: PetscCall(PetscMalloc1(m, &c->ilen));
2837: PetscCall(PetscMalloc1(m, &c->imax));
2838: c->maxnz = c->nz;
2839: c->nonzerorowcnt = 0;
2840: c->rmax = 0;
2841: for (k = 0; k < m; k++) {
2842: const PetscInt nn = c->i[k + 1] - c->i[k];
2843: c->ilen[k] = c->imax[k] = nn;
2844: c->nonzerorowcnt += (PetscInt)!!nn;
2845: c->rmax = PetscMax(c->rmax, nn);
2846: }
2847: PetscCall(PetscMalloc1(c->nz, &c->a));
2848: Ccsr->num_entries = c->nz;
2850: C->nonzerostate++;
2851: PetscCall(PetscLayoutSetUp(C->rmap));
2852: PetscCall(PetscLayoutSetUp(C->cmap));
2853: Ccusp->nonzerostate = C->nonzerostate;
2854: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2855: C->preallocated = PETSC_TRUE;
2856: C->assembled = PETSC_FALSE;
2857: C->was_assembled = PETSC_FALSE;
2858: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2859: mmdata->reusesym = PETSC_TRUE;
2860: C->offloadmask = PETSC_OFFLOAD_GPU;
2861: }
2862: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2863: PetscFunctionReturn(PETSC_SUCCESS);
2864: }
2866: /* handles sparse or dense B */
2867: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2868: {
2869: Mat_Product *product = mat->product;
2870: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2872: PetscFunctionBegin;
2873: MatCheckProduct(mat, 1);
2874: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2875: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2876: if (product->type == MATPRODUCT_ABC) {
2877: Ciscusp = PETSC_FALSE;
2878: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2879: }
2880: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2881: PetscBool usecpu = PETSC_FALSE;
2882: switch (product->type) {
2883: case MATPRODUCT_AB:
2884: if (product->api_user) {
2885: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2886: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2887: PetscOptionsEnd();
2888: } else {
2889: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2890: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2891: PetscOptionsEnd();
2892: }
2893: break;
2894: case MATPRODUCT_AtB:
2895: if (product->api_user) {
2896: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2897: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2898: PetscOptionsEnd();
2899: } else {
2900: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2901: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2902: PetscOptionsEnd();
2903: }
2904: break;
2905: case MATPRODUCT_PtAP:
2906: if (product->api_user) {
2907: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2908: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2909: PetscOptionsEnd();
2910: } else {
2911: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2912: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2913: PetscOptionsEnd();
2914: }
2915: break;
2916: case MATPRODUCT_RARt:
2917: if (product->api_user) {
2918: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2919: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2920: PetscOptionsEnd();
2921: } else {
2922: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2923: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2924: PetscOptionsEnd();
2925: }
2926: break;
2927: case MATPRODUCT_ABC:
2928: if (product->api_user) {
2929: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2930: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2931: PetscOptionsEnd();
2932: } else {
2933: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2934: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2935: PetscOptionsEnd();
2936: }
2937: break;
2938: default:
2939: break;
2940: }
2941: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2942: }
2943: /* dispatch */
2944: if (isdense) {
2945: switch (product->type) {
2946: case MATPRODUCT_AB:
2947: case MATPRODUCT_AtB:
2948: case MATPRODUCT_ABt:
2949: case MATPRODUCT_PtAP:
2950: case MATPRODUCT_RARt:
2951: if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2952: else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2953: break;
2954: case MATPRODUCT_ABC:
2955: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2956: break;
2957: default:
2958: break;
2959: }
2960: } else if (Biscusp && Ciscusp) {
2961: switch (product->type) {
2962: case MATPRODUCT_AB:
2963: case MATPRODUCT_AtB:
2964: case MATPRODUCT_ABt:
2965: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2966: break;
2967: case MATPRODUCT_PtAP:
2968: case MATPRODUCT_RARt:
2969: case MATPRODUCT_ABC:
2970: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2971: break;
2972: default:
2973: break;
2974: }
2975: } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2976: PetscFunctionReturn(PETSC_SUCCESS);
2977: }
2979: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2980: {
2981: PetscFunctionBegin;
2982: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2983: PetscFunctionReturn(PETSC_SUCCESS);
2984: }
2986: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2987: {
2988: PetscFunctionBegin;
2989: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2990: PetscFunctionReturn(PETSC_SUCCESS);
2991: }
2993: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2994: {
2995: PetscFunctionBegin;
2996: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2997: PetscFunctionReturn(PETSC_SUCCESS);
2998: }
3000: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3001: {
3002: PetscFunctionBegin;
3003: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3004: PetscFunctionReturn(PETSC_SUCCESS);
3005: }
3007: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3008: {
3009: PetscFunctionBegin;
3010: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3011: PetscFunctionReturn(PETSC_SUCCESS);
3012: }
3014: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3015: {
3016: int i = blockIdx.x * blockDim.x + threadIdx.x;
3017: if (i < n) y[idx[i]] += x[i];
3018: }
3020: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3021: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3022: {
3023: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3024: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3025: Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3026: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3027: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3028: PetscBool compressed;
3029: PetscInt nx, ny;
3031: PetscFunctionBegin;
3032: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3033: if (!a->nz) {
3034: if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3035: else PetscCall(VecSeq_HIP::Set(zz, 0));
3036: PetscFunctionReturn(PETSC_SUCCESS);
3037: }
3038: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3039: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3040: if (!trans) {
3041: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3042: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3043: } else {
3044: if (herm || !A->form_explicit_transpose) {
3045: opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3046: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3047: } else {
3048: if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3049: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3050: }
3051: }
3052: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3053: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3054: try {
3055: PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3056: if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3057: else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3059: PetscCall(PetscLogGpuTimeBegin());
3060: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3061: /* z = A x + beta y.
3062: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3063: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3064: */
3065: xptr = xarray;
3066: dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3067: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3068: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3069: allocated to accommodate different uses. So we get the length info directly from mat.
3070: */
3071: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3072: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3073: nx = mat->num_cols;
3074: ny = mat->num_rows;
3075: }
3076: } else {
3077: /* z = A^T x + beta y
3078: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3079: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3080: */
3081: xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3082: dptr = zarray;
3083: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3084: if (compressed) { /* Scatter x to work vector */
3085: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3086: thrust::for_each(
3087: #if PetscDefined(HAVE_THRUST_ASYNC)
3088: thrust::hip::par.on(PetscDefaultHipStream),
3089: #endif
3090: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3091: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3092: }
3093: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3094: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3095: nx = mat->num_rows;
3096: ny = mat->num_cols;
3097: }
3098: }
3099: /* csr_spmv does y = alpha op(A) x + beta y */
3100: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3101: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !PETSC_PKG_HIP_VERSION_EQ(7, 2, 0)
3102: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3103: if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3104: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3105: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3106: PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3107: &matstruct->hipSpMV[opA].spmvBufferSize));
3108: PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3109: matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3110: } else {
3111: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3112: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3113: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3114: }
3115: PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3116: matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3117: #else
3118: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3119: nx = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
3120: ny = mat->num_cols;
3121: PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3122: #endif
3123: } else {
3124: if (hipsparsestruct->nrows) {
3125: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3126: PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3127: }
3128: }
3129: PetscCall(PetscLogGpuTimeEnd());
3131: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3132: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3133: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3134: PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */
3135: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3136: PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3137: }
3138: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3139: PetscCall(VecSeq_HIP::Set(zz, 0));
3140: }
3142: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3143: if (compressed) {
3144: PetscCall(PetscLogGpuTimeBegin());
3145: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3146: and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3147: prevent that. So I just add a ScatterAdd kernel.
3148: */
3149: #if 0
3150: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3151: thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3152: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3153: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3154: VecHIPPlusEquals());
3155: #else
3156: PetscInt n = matstruct->cprowIndices->size();
3157: hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3158: #endif
3159: PetscCall(PetscLogGpuTimeEnd());
3160: }
3161: } else {
3162: if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3163: }
3164: PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3165: if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3166: else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3167: } catch (char *ex) {
3168: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3169: }
3170: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3171: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3172: PetscFunctionReturn(PETSC_SUCCESS);
3173: }
3175: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3176: {
3177: PetscFunctionBegin;
3178: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3179: PetscFunctionReturn(PETSC_SUCCESS);
3180: }
/* Assembly-end hook installed for SEQAIJHIPSPARSE: forwards unchanged to the SeqAIJ implementation and does nothing else here */
static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@
  MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
  This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is set
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]

  The AIJ format (compressed row storage), is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either `nz` or `nnz` (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* sequential matrix: the same m and n are passed as both local and global sizes */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
  /* the SeqAIJ preallocation routine is called directly; the cast only drops the const qualifier of nnz */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3232: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3233: {
3234: PetscFunctionBegin;
3235: if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3236: else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3237: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3238: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3239: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3240: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3241: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3242: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3243: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3244: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3245: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3246: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3247: PetscCall(MatDestroy_SeqAIJ(A));
3248: PetscFunctionReturn(PETSC_SUCCESS);
3249: }
/*
  Duplicates a SEQAIJHIPSPARSE matrix in two steps: the SeqAIJ duplicate routine creates *B
  (cpvalues is passed through unchanged), then *B is converted in place to MATSEQAIJHIPSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* MAT_INPLACE_MATRIX: the conversion operates on *B itself */
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3259: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3260: {
3261: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3262: Mat_SeqAIJHIPSPARSE *cy;
3263: Mat_SeqAIJHIPSPARSE *cx;
3264: PetscScalar *ay;
3265: const PetscScalar *ax;
3266: CsrMatrix *csry, *csrx;
3268: PetscFunctionBegin;
3269: cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3270: cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3271: if (X->ops->axpy != Y->ops->axpy) {
3272: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3273: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3274: PetscFunctionReturn(PETSC_SUCCESS);
3275: }
3276: /* if we are here, it means both matrices are bound to GPU */
3277: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3278: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3279: PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3280: PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3281: csry = (CsrMatrix *)cy->mat->mat;
3282: csrx = (CsrMatrix *)cx->mat->mat;
3283: /* see if we can turn this into a hipblas axpy */
3284: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3285: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3286: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3287: if (eq) str = SAME_NONZERO_PATTERN;
3288: }
3289: /* spgeam is buggy with one column */
3290: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3291: if (str == SUBSET_NONZERO_PATTERN) {
3292: PetscScalar b = 1.0;
3293: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3294: size_t bufferSize;
3295: void *buffer;
3296: #endif
3298: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3299: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3300: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3301: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3302: PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3303: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3304: PetscCallHIP(hipMalloc(&buffer, bufferSize));
3305: PetscCall(PetscLogGpuTimeBegin());
3306: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3307: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3308: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3309: PetscCall(PetscLogGpuTimeEnd());
3310: PetscCallHIP(hipFree(buffer));
3311: #else
3312: PetscCall(PetscLogGpuTimeBegin());
3313: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3314: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3315: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3316: PetscCall(PetscLogGpuTimeEnd());
3317: #endif
3318: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3319: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3320: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3321: } else if (str == SAME_NONZERO_PATTERN) {
3322: hipblasHandle_t hipblasv2handle;
3323: PetscBLASInt one = 1, bnz = 1;
3325: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3326: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3327: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3328: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3329: PetscCall(PetscLogGpuTimeBegin());
3330: PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3331: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3332: PetscCall(PetscLogGpuTimeEnd());
3333: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3334: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3335: } else {
3336: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3337: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3338: }
3339: PetscFunctionReturn(PETSC_SUCCESS);
3340: }
3342: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3343: {
3344: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3345: PetscScalar *ay;
3346: hipblasHandle_t hipblasv2handle;
3347: PetscBLASInt one = 1, bnz = 1;
3349: PetscFunctionBegin;
3350: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3351: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3352: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3353: PetscCall(PetscLogGpuTimeBegin());
3354: PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3355: PetscCall(PetscLogGpuFlops(bnz));
3356: PetscCall(PetscLogGpuTimeEnd());
3357: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3358: PetscFunctionReturn(PETSC_SUCCESS);
3359: }
3361: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3362: {
3363: PetscBool both = PETSC_FALSE;
3364: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3366: PetscFunctionBegin;
3367: if (A->factortype == MAT_FACTOR_NONE) {
3368: Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3369: if (spptr->mat) {
3370: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3371: if (matrix->values) {
3372: both = PETSC_TRUE;
3373: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3374: }
3375: }
3376: if (spptr->matTranspose) {
3377: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3378: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3379: }
3380: }
3381: //PetscCall(MatZeroEntries_SeqAIJ(A));
3382: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3383: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3384: else A->offloadmask = PETSC_OFFLOAD_CPU;
3385: PetscFunctionReturn(PETSC_SUCCESS);
3386: }
/* Reports the memory type for this matrix class: always PETSC_MEMTYPE_HIP; the matrix argument is not inspected */
static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_HIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3395: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3396: {
3397: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3399: PetscFunctionBegin;
3400: if (A->factortype != MAT_FACTOR_NONE) {
3401: A->boundtocpu = flg;
3402: PetscFunctionReturn(PETSC_SUCCESS);
3403: }
3404: if (flg) {
3405: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
3407: A->ops->scale = MatScale_SeqAIJ;
3408: A->ops->axpy = MatAXPY_SeqAIJ;
3409: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3410: A->ops->mult = MatMult_SeqAIJ;
3411: A->ops->multadd = MatMultAdd_SeqAIJ;
3412: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3413: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3414: A->ops->multhermitiantranspose = NULL;
3415: A->ops->multhermitiantransposeadd = NULL;
3416: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3417: A->ops->getcurrentmemtype = NULL;
3418: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3419: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3420: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3421: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3422: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3423: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3424: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3425: } else {
3426: A->ops->scale = MatScale_SeqAIJHIPSPARSE;
3427: A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
3428: A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
3429: A->ops->mult = MatMult_SeqAIJHIPSPARSE;
3430: A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
3431: A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
3432: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3433: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3434: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3435: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3436: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;
3437: a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3438: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3439: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3440: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3441: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3442: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3443: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3444: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3445: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3446: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3447: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3448: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3449: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3450: }
3451: A->boundtocpu = flg;
3452: if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
3453: else a->inode.use = PETSC_FALSE;
3454: PetscFunctionReturn(PETSC_SUCCESS);
3455: }
3457: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3458: {
3459: Mat B;
3461: PetscFunctionBegin;
3462: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3463: if (reuse == MAT_INITIAL_MATRIX) {
3464: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3465: } else if (reuse == MAT_REUSE_MATRIX) {
3466: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3467: }
3468: B = *newmat;
3469: PetscCall(PetscFree(B->defaultvectype));
3470: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3471: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3472: if (B->factortype == MAT_FACTOR_NONE) {
3473: Mat_SeqAIJHIPSPARSE *spptr;
3474: PetscCall(PetscNew(&spptr));
3475: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3476: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3477: spptr->format = MAT_HIPSPARSE_CSR;
3478: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3479: spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3480: #else
3481: spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3482: #endif
3483: spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3484: //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;
3486: B->spptr = spptr;
3487: } else {
3488: Mat_SeqAIJHIPSPARSETriFactors *spptr;
3490: PetscCall(PetscNew(&spptr));
3491: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3492: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3493: B->spptr = spptr;
3494: }
3495: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3496: }
3497: B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
3498: B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
3499: B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
3500: B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3501: B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
3502: B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;
3503: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;
3505: PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3506: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3507: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3508: #if defined(PETSC_HAVE_HYPRE)
3509: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3510: #endif
3511: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3512: PetscFunctionReturn(PETSC_SUCCESS);
3513: }
3515: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3516: {
3517: PetscFunctionBegin;
3518: PetscCall(MatCreate_SeqAIJ(B));
3519: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
3520: PetscFunctionReturn(PETSC_SUCCESS);
3521: }
3523: /*MC
3524: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
3526: A matrix type whose data resides on AMD GPUs. These matrices can be in either
3527: CSR, ELL, or Hybrid format.
3528: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
3530: Options Database Keys:
3531: + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3532: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3533: Other options include ell (ellpack) or hyb (hybrid).
3534: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3535: - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3537: Level: beginner
3539: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3540: M*/
3542: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3543: {
3544: PetscFunctionBegin;
3545: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3546: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3547: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3548: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3549: PetscFunctionReturn(PETSC_SUCCESS);
3550: }
3552: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
3553: {
3554: Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
3556: PetscFunctionBegin;
3557: if (cusp) {
3558: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
3559: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3560: delete cusp->workVector;
3561: delete cusp->rowoffsets_gpu;
3562: delete cusp->csr2csc_i;
3563: delete cusp->coords;
3564: if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
3565: PetscCall(PetscFree(mat->spptr));
3566: }
3567: PetscFunctionReturn(PETSC_SUCCESS);
3568: }
3570: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3571: {
3572: PetscFunctionBegin;
3573: if (*mat) {
3574: delete (*mat)->values;
3575: delete (*mat)->column_indices;
3576: delete (*mat)->row_offsets;
3577: delete *mat;
3578: *mat = 0;
3579: }
3580: PetscFunctionReturn(PETSC_SUCCESS);
3581: }
3583: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3584: {
3585: PetscFunctionBegin;
3586: if (*trifactor) {
3587: if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3588: if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3589: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3590: if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3591: if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3592: if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3593: PetscCall(PetscFree(*trifactor));
3594: }
3595: PetscFunctionReturn(PETSC_SUCCESS);
3596: }
3598: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3599: {
3600: CsrMatrix *mat;
3602: PetscFunctionBegin;
3603: if (*matstruct) {
3604: if ((*matstruct)->mat) {
3605: if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3606: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3607: PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3608: } else {
3609: mat = (CsrMatrix *)(*matstruct)->mat;
3610: PetscCall(CsrMatrix_Destroy(&mat));
3611: }
3612: }
3613: if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3614: delete (*matstruct)->cprowIndices;
3615: if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3616: if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3617: if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));
3619: Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3620: if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3621: for (int i = 0; i < 3; i++) {
3622: if (mdata->hipSpMV[i].initialized) {
3623: PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3624: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3625: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3626: }
3627: }
3628: delete *matstruct;
3629: *matstruct = NULL;
3630: }
3631: PetscFunctionReturn(PETSC_SUCCESS);
3632: }
3634: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3635: {
3636: Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;
3638: PetscFunctionBegin;
3639: if (fs) {
3640: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3641: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3642: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3643: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3644: delete fs->rpermIndices;
3645: delete fs->cpermIndices;
3646: delete fs->workVector;
3647: fs->rpermIndices = NULL;
3648: fs->cpermIndices = NULL;
3649: fs->workVector = NULL;
3650: fs->init_dev_prop = PETSC_FALSE;
3651: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3652: PetscCallHIP(hipFree(fs->csrRowPtr));
3653: PetscCallHIP(hipFree(fs->csrColIdx));
3654: PetscCallHIP(hipFree(fs->csrVal));
3655: PetscCallHIP(hipFree(fs->X));
3656: PetscCallHIP(hipFree(fs->Y));
3657: // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3658: PetscCallHIP(hipFree(fs->spsvBuffer_L));
3659: PetscCallHIP(hipFree(fs->spsvBuffer_U));
3660: PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3661: PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3662: PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3663: if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3664: if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3665: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3666: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3667: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3668: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3669: if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3670: if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3671: PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3672: PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));
3674: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3675: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3676: #endif
3677: }
3678: PetscFunctionReturn(PETSC_SUCCESS);
3679: }
3681: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3682: {
3683: hipsparseHandle_t handle;
3685: PetscFunctionBegin;
3686: if (*trifactors) {
3687: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3688: if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3689: PetscCall(PetscFree(*trifactors));
3690: }
3691: PetscFunctionReturn(PETSC_SUCCESS);
3692: }
3694: struct IJCompare {
3695: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3696: {
3697: if (t1.get<0>() < t2.get<0>()) return true;
3698: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3699: return false;
3700: }
3701: };
3703: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3704: {
3705: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3707: PetscFunctionBegin;
3708: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3709: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3710: if (destroy) {
3711: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3712: delete cusp->csr2csc_i;
3713: cusp->csr2csc_i = NULL;
3714: }
3715: A->transupdated = PETSC_FALSE;
3716: PetscFunctionReturn(PETSC_SUCCESS);
3717: }
3719: static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data)
3720: {
3721: MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data;
3723: PetscFunctionBegin;
3724: PetscCallHIP(hipFree(coo->perm));
3725: PetscCallHIP(hipFree(coo->jmap));
3726: PetscCall(PetscFree(coo));
3727: PetscFunctionReturn(PETSC_SUCCESS);
3728: }
3730: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3731: {
3732: PetscBool dev_ij = PETSC_FALSE;
3733: PetscMemType mtype = PETSC_MEMTYPE_HOST;
3734: PetscInt *i, *j;
3735: PetscContainer container_h;
3736: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
3738: PetscFunctionBegin;
3739: PetscCall(PetscGetMemType(coo_i, &mtype));
3740: if (PetscMemTypeDevice(mtype)) {
3741: dev_ij = PETSC_TRUE;
3742: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
3743: PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3744: PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3745: } else {
3746: i = coo_i;
3747: j = coo_j;
3748: }
3749: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
3750: if (dev_ij) PetscCall(PetscFree2(i, j));
3751: mat->offloadmask = PETSC_OFFLOAD_CPU;
3752: // Create the GPU memory
3753: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));
3755: // Copy the COO struct to device
3756: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
3757: PetscCall(PetscContainerGetPointer(container_h, &coo_h));
3758: PetscCall(PetscMalloc1(1, &coo_d));
3759: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
3760: PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
3761: PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
3762: PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
3763: PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));
3765: // Put the COO struct in a container and then attach that to the matrix
3766: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
3767: PetscFunctionReturn(PETSC_SUCCESS);
3768: }
3770: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3771: {
3772: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
3773: const PetscCount grid_size = gridDim.x * blockDim.x;
3774: for (; i < nnz; i += grid_size) {
3775: PetscScalar sum = 0.0;
3776: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3777: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3778: }
3779: }
3781: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3782: {
3783: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
3784: Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3785: PetscCount Annz = seq->nz;
3786: PetscMemType memtype;
3787: const PetscScalar *v1 = v;
3788: PetscScalar *Aa;
3789: PetscContainer container;
3790: MatCOOStruct_SeqAIJ *coo;
3792: PetscFunctionBegin;
3793: if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3795: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
3796: PetscCall(PetscContainerGetPointer(container, &coo));
3798: PetscCall(PetscGetMemType(v, &memtype));
3799: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
3800: PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
3801: PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
3802: }
3804: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
3805: else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));
3807: PetscCall(PetscLogGpuTimeBegin());
3808: if (Annz) {
3809: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
3810: PetscCallHIP(hipPeekAtLastError());
3811: }
3812: PetscCall(PetscLogGpuTimeEnd());
3814: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
3815: else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
3817: if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
3818: PetscFunctionReturn(PETSC_SUCCESS);
3819: }
3821: /*@C
3822: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3824: Not Collective
3826: Input Parameters:
3827: + A - the matrix
3828: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3830: Output Parameters:
3831: + i - the CSR row pointers
3832: - j - the CSR column indices
3834: Level: developer
3836: Note:
3837: When compressed is true, the CSR structure does not contain empty rows
3839: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3840: @*/
3841: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3842: {
3843: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3844: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3845: CsrMatrix *csr;
3847: PetscFunctionBegin;
3849: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
3850: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3851: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3852: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3853: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3854: csr = (CsrMatrix *)cusp->mat->mat;
3855: if (i) {
3856: if (!compressed && a->compressedrow.use) { /* need full row offset */
3857: if (!cusp->rowoffsets_gpu) {
3858: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3859: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3860: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3861: }
3862: *i = cusp->rowoffsets_gpu->data().get();
3863: } else *i = csr->row_offsets->data().get();
3864: }
3865: if (j) *j = csr->column_indices->data().get();
3866: PetscFunctionReturn(PETSC_SUCCESS);
3867: }
3869: /*@C
3870: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3872: Not Collective
3874: Input Parameters:
3875: + A - the matrix
3876: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3877: . i - the CSR row pointers
3878: - j - the CSR column indices
3880: Level: developer
3882: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3883: @*/
3884: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
3885: {
3886: PetscFunctionBegin;
3888: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3889: if (i) *i = NULL;
3890: if (j) *j = NULL;
3891: PetscFunctionReturn(PETSC_SUCCESS);
3892: }
3894: /*@C
3895: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3897: Not Collective
3899: Input Parameter:
3900: . A - a `MATSEQAIJHIPSPARSE` matrix
3902: Output Parameter:
3903: . a - pointer to the device data
3905: Level: developer
3907: Note:
3908: May trigger host-device copies if the up-to-date matrix data is on host
3910: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3911: @*/
3912: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3913: {
3914: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3915: CsrMatrix *csr;
3917: PetscFunctionBegin;
3919: PetscAssertPointer(a, 2);
3920: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3921: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3922: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3923: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3924: csr = (CsrMatrix *)cusp->mat->mat;
3925: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3926: *a = csr->values->data().get();
3927: PetscFunctionReturn(PETSC_SUCCESS);
3928: }
3930: /*@C
3931: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3933: Not Collective
3935: Input Parameters:
3936: + A - a `MATSEQAIJHIPSPARSE` matrix
3937: - a - pointer to the device data
3939: Level: developer
3941: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3942: @*/
3943: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3944: {
3945: PetscFunctionBegin;
3947: PetscAssertPointer(a, 2);
3948: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3949: *a = NULL;
3950: PetscFunctionReturn(PETSC_SUCCESS);
3951: }
3953: /*@C
3954: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3956: Not Collective
3958: Input Parameter:
3959: . A - a `MATSEQAIJHIPSPARSE` matrix
3961: Output Parameter:
3962: . a - pointer to the device data
3964: Level: developer
3966: Note:
3967: May trigger host-device copies if up-to-date matrix data is on host
3969: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3970: @*/
3971: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3972: {
3973: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3974: CsrMatrix *csr;
3976: PetscFunctionBegin;
3978: PetscAssertPointer(a, 2);
3979: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3980: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3981: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3982: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3983: csr = (CsrMatrix *)cusp->mat->mat;
3984: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3985: *a = csr->values->data().get();
3986: A->offloadmask = PETSC_OFFLOAD_GPU;
3987: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
3988: PetscFunctionReturn(PETSC_SUCCESS);
3989: }
3990: /*@C
3991: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3993: Not Collective
3995: Input Parameters:
3996: + A - a `MATSEQAIJHIPSPARSE` matrix
3997: - a - pointer to the device data
3999: Level: developer
4001: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4002: @*/
4003: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
4004: {
4005: PetscFunctionBegin;
4007: PetscAssertPointer(a, 2);
4008: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4009: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4010: *a = NULL;
4011: PetscFunctionReturn(PETSC_SUCCESS);
4012: }
4014: /*@C
4015: MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4017: Not Collective
4019: Input Parameter:
4020: . A - a `MATSEQAIJHIPSPARSE` matrix
4022: Output Parameter:
4023: . a - pointer to the device data
4025: Level: developer
4027: Note:
4028: Does not trigger host-device copies and flags data validity on the GPU
4030: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4031: @*/
4032: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
4033: {
4034: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4035: CsrMatrix *csr;
4037: PetscFunctionBegin;
4039: PetscAssertPointer(a, 2);
4040: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4041: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4042: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4043: csr = (CsrMatrix *)cusp->mat->mat;
4044: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4045: *a = csr->values->data().get();
4046: A->offloadmask = PETSC_OFFLOAD_GPU;
4047: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4048: PetscFunctionReturn(PETSC_SUCCESS);
4049: }
4051: /*@C
4052: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4054: Not Collective
4056: Input Parameters:
4057: + A - a `MATSEQAIJHIPSPARSE` matrix
4058: - a - pointer to the device data
4060: Level: developer
4062: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4063: @*/
4064: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
4065: {
4066: PetscFunctionBegin;
4068: PetscAssertPointer(a, 2);
4069: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4070: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4071: *a = NULL;
4072: PetscFunctionReturn(PETSC_SUCCESS);
4073: }
4075: struct IJCompare4 {
4076: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4077: {
4078: if (t1.get<0>() < t2.get<0>()) return true;
4079: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4080: return false;
4081: }
4082: };
4084: struct Shift {
4085: int _shift;
4087: Shift(int shift) : _shift(shift) { }
4088: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4089: };
4091: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4092: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4093: {
4094: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4095: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4096: Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4097: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4098: PetscInt Annz, Bnnz;
4099: PetscInt i, m, n, zero = 0;
4101: PetscFunctionBegin;
4104: PetscAssertPointer(C, 4);
4105: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4106: PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4107: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4108: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4109: PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4110: PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4111: if (reuse == MAT_INITIAL_MATRIX) {
4112: m = A->rmap->n;
4113: n = A->cmap->n + B->cmap->n;
4114: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4115: PetscCall(MatSetSizes(*C, m, n, m, n));
4116: PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4117: c = (Mat_SeqAIJ *)(*C)->data;
4118: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4119: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
4120: Ccsr = new CsrMatrix;
4121: Cmat->cprowIndices = NULL;
4122: c->compressedrow.use = PETSC_FALSE;
4123: c->compressedrow.nrows = 0;
4124: c->compressedrow.i = NULL;
4125: c->compressedrow.rindex = NULL;
4126: Ccusp->workVector = NULL;
4127: Ccusp->nrows = m;
4128: Ccusp->mat = Cmat;
4129: Ccusp->mat->mat = Ccsr;
4130: Ccsr->num_rows = m;
4131: Ccsr->num_cols = n;
4132: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4133: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4134: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4135: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4136: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4137: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4138: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4139: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4140: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4141: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4142: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4143: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4144: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4146: Acsr = (CsrMatrix *)Acusp->mat->mat;
4147: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4148: Annz = (PetscInt)Acsr->column_indices->size();
4149: Bnnz = (PetscInt)Bcsr->column_indices->size();
4150: c->nz = Annz + Bnnz;
4151: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4152: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4153: Ccsr->values = new THRUSTARRAY(c->nz);
4154: Ccsr->num_entries = c->nz;
4155: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4156: if (c->nz) {
4157: auto Acoo = new THRUSTINTARRAY32(Annz);
4158: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4159: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4160: THRUSTINTARRAY32 *Aroff, *Broff;
4162: if (a->compressedrow.use) { /* need full row offset */
4163: if (!Acusp->rowoffsets_gpu) {
4164: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4165: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4166: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4167: }
4168: Aroff = Acusp->rowoffsets_gpu;
4169: } else Aroff = Acsr->row_offsets;
4170: if (b->compressedrow.use) { /* need full row offset */
4171: if (!Bcusp->rowoffsets_gpu) {
4172: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4173: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4174: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4175: }
4176: Broff = Bcusp->rowoffsets_gpu;
4177: } else Broff = Bcsr->row_offsets;
4178: PetscCall(PetscLogGpuTimeBegin());
4179: PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4180: PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4181: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4182: auto Aperm = thrust::make_constant_iterator(1);
4183: auto Bperm = thrust::make_constant_iterator(0);
4184: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4185: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4186: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4187: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4188: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4189: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4190: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4191: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4192: auto p1 = Ccusp->coords->begin();
4193: auto p2 = Ccusp->coords->begin();
4194: thrust::advance(p2, Annz);
4195: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4196: auto cci = thrust::make_counting_iterator(zero);
4197: auto cce = thrust::make_counting_iterator(c->nz);
4198: #if 0 //Errors on SUMMIT cuda 11.1.0
4199: PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4200: #else
4201: auto pred = [](const int &x) { return x; };
4202: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4203: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4204: #endif
4205: PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4206: PetscCall(PetscLogGpuTimeEnd());
4207: delete wPerm;
4208: delete Acoo;
4209: delete Bcoo;
4210: delete Ccoo;
4211: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4213: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4214: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4215: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4216: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4217: Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4218: CsrMatrix *CcsrT = new CsrMatrix;
4219: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4220: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4222: (*C)->form_explicit_transpose = PETSC_TRUE;
4223: (*C)->transupdated = PETSC_TRUE;
4224: Ccusp->rowoffsets_gpu = NULL;
4225: CmatT->cprowIndices = NULL;
4226: CmatT->mat = CcsrT;
4227: CcsrT->num_rows = n;
4228: CcsrT->num_cols = m;
4229: CcsrT->num_entries = c->nz;
4230: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4231: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4232: CcsrT->values = new THRUSTARRAY(c->nz);
4234: PetscCall(PetscLogGpuTimeBegin());
4235: auto rT = CcsrT->row_offsets->begin();
4236: if (AT) {
4237: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4238: thrust::advance(rT, -1);
4239: }
4240: if (BT) {
4241: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4242: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4243: thrust::copy(titb, tite, rT);
4244: }
4245: auto cT = CcsrT->column_indices->begin();
4246: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4247: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4248: auto vT = CcsrT->values->begin();
4249: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4250: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4251: PetscCall(PetscLogGpuTimeEnd());
4253: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4254: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4255: PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4256: PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4257: PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4258: PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4259: PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4260: PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4261: PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4263: PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4264: Ccusp->matTranspose = CmatT;
4265: }
4266: }
4268: c->free_a = PETSC_TRUE;
4269: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4270: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4271: c->free_ij = PETSC_TRUE;
4272: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4273: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4274: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4275: ii = *Ccsr->row_offsets;
4276: jj = *Ccsr->column_indices;
4277: PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4278: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4279: } else {
4280: PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4281: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4282: }
4283: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4284: PetscCall(PetscMalloc1(m, &c->ilen));
4285: PetscCall(PetscMalloc1(m, &c->imax));
4286: c->maxnz = c->nz;
4287: c->nonzerorowcnt = 0;
4288: c->rmax = 0;
4289: for (i = 0; i < m; i++) {
4290: const PetscInt nn = c->i[i + 1] - c->i[i];
4291: c->ilen[i] = c->imax[i] = nn;
4292: c->nonzerorowcnt += (PetscInt)!!nn;
4293: c->rmax = PetscMax(c->rmax, nn);
4294: }
4295: PetscCall(PetscMalloc1(c->nz, &c->a));
4296: (*C)->nonzerostate++;
4297: PetscCall(PetscLayoutSetUp((*C)->rmap));
4298: PetscCall(PetscLayoutSetUp((*C)->cmap));
4299: Ccusp->nonzerostate = (*C)->nonzerostate;
4300: (*C)->preallocated = PETSC_TRUE;
4301: } else {
4302: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4303: c = (Mat_SeqAIJ *)(*C)->data;
4304: if (c->nz) {
4305: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4306: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4307: PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4308: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4309: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4310: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4311: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4312: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4313: Acsr = (CsrMatrix *)Acusp->mat->mat;
4314: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4315: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4316: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4317: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4318: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4319: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4320: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4321: auto pmid = Ccusp->coords->begin();
4322: thrust::advance(pmid, Acsr->num_entries);
4323: PetscCall(PetscLogGpuTimeBegin());
4324: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4325: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4326: thrust::for_each(zibait, zieait, VecHIPEquals());
4327: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4328: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4329: thrust::for_each(zibbit, ziebit, VecHIPEquals());
4330: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4331: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4332: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4333: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4334: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4335: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4336: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4337: auto vT = CcsrT->values->begin();
4338: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4339: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4340: (*C)->transupdated = PETSC_TRUE;
4341: }
4342: PetscCall(PetscLogGpuTimeEnd());
4343: }
4344: }
4345: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4346: (*C)->assembled = PETSC_TRUE;
4347: (*C)->was_assembled = PETSC_FALSE;
4348: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4349: PetscFunctionReturn(PETSC_SUCCESS);
4350: }
4352: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4353: {
4354: bool dmem;
4355: const PetscScalar *av;
4357: PetscFunctionBegin;
4358: dmem = isHipMem(v);
4359: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4360: if (n && idx) {
4361: THRUSTINTARRAY widx(n);
4362: widx.assign(idx, idx + n);
4363: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4365: THRUSTARRAY *w = NULL;
4366: thrust::device_ptr<PetscScalar> dv;
4367: if (dmem) dv = thrust::device_pointer_cast(v);
4368: else {
4369: w = new THRUSTARRAY(n);
4370: dv = w->data();
4371: }
4372: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4374: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4375: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4376: thrust::for_each(zibit, zieit, VecHIPEquals());
4377: if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4378: delete w;
4379: } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4381: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4382: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4383: PetscFunctionReturn(PETSC_SUCCESS);
4384: }