// Actual source code: aijhipsparse.hip.cxx
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/iterator/transform_iterator.h>
17: #if PETSC_CPP_VERSION >= 14
18: #define PETSC_HAVE_THRUST_ASYNC 1
19: #include <thrust/async/for_each.h>
20: #endif
21: #include <thrust/iterator/constant_iterator.h>
22: #include <thrust/iterator/discard_iterator.h>
23: #include <thrust/binary_search.h>
24: #include <thrust/remove.h>
25: #include <thrust/sort.h>
26: #include <thrust/unique.h>
/* Option-name tables consumed by PetscOptionsEnum(). The leading entries must stay in the
   same order as the values of the corresponding enum; each table ends with the enum type
   name, the option prefix, and a terminating 0, as PetscOptionsEnum() requires. */
const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
/* Order must track hipsparseSpMVAlg_t; consistency is verified in MatSetFromOptions_SeqAIJHIPSPARSE() */
const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
/* Order must track hipsparseSpMMAlg_t; consistency is verified in MatSetFromOptions_SeqAIJHIPSPARSE() */
const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
//const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
/* Forward declarations for the SeqAIJHIPSPARSE implementation routines defined later in
   this file (factorizations, triangular solves, mat-vec products, and lifecycle helpers). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
/* NOTE: the next two declarations are C++ overloads sharing one name; the first tears down
   a triangular-factor struct, the second a mult struct for the given storage format */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
70: /*
71: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
72: {
73: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
75: PetscFunctionBegin;
76: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
77: hipsparsestruct->stream = stream;
78: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
79: PetscFunctionReturn(PETSC_SUCCESS);
80: }
82: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
83: {
84: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
86: PetscFunctionBegin;
87: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
88: if (hipsparsestruct->handle != handle) {
89: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
90: hipsparsestruct->handle = handle;
91: }
92: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
93: PetscFunctionReturn(PETSC_SUCCESS);
94: }
96: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
97: {
98: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
99: PetscBool flg;
101: PetscFunctionBegin;
102: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
103: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
104: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
107: */
109: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
110: {
111: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
113: PetscFunctionBegin;
114: switch (op) {
115: case MAT_HIPSPARSE_MULT:
116: hipsparsestruct->format = format;
117: break;
118: case MAT_HIPSPARSE_ALL:
119: hipsparsestruct->format = format;
120: break;
121: default:
122: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
123: }
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: /*@
128: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
129: operation. Only the `MatMult()` operation can use different GPU storage formats
131: Not Collective
133: Input Parameters:
134: + A - Matrix of type `MATSEQAIJHIPSPARSE`
135: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
136: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
137: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
139: Level: intermediate
141: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
144: {
145: PetscFunctionBegin;
147: PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
148: PetscFunctionReturn(PETSC_SUCCESS);
149: }
151: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
152: {
153: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
155: PetscFunctionBegin;
156: hipsparsestruct->use_cpu_solve = use_cpu;
157: PetscFunctionReturn(PETSC_SUCCESS);
158: }
160: /*@
161: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
163: Input Parameters:
164: + A - Matrix of type `MATSEQAIJHIPSPARSE`
165: - use_cpu - set flag for using the built-in CPU `MatSolve()`
167: Level: intermediate
169: Notes:
170: The hipSparse LU solver currently computes the factors with the built-in CPU method
171: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
174: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
175: @*/
176: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
177: {
178: PetscFunctionBegin;
180: PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
181: PetscFunctionReturn(PETSC_SUCCESS);
182: }
184: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
185: {
186: PetscFunctionBegin;
187: switch (op) {
188: case MAT_FORM_EXPLICIT_TRANSPOSE:
189: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
190: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
191: A->form_explicit_transpose = flg;
192: break;
193: default:
194: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
195: break;
196: }
197: PetscFunctionReturn(PETSC_SUCCESS);
198: }
/* Numeric LU factorization for SeqAIJHIPSPARSE: the factorization itself runs on the CPU
   via the SeqAIJ kernel; afterwards the triangular-solve function pointers are switched to
   the GPU versions (unless CPU solve was requested) and the factors are copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscBool            row_identity, col_identity;
  Mat_SeqAIJ          *b     = (Mat_SeqAIJ *)B->data;
  IS                   isrow = b->row, iscol = b->col;
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* ensure the host copy of A is current before factoring on the CPU */
  PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  /* the fresh factors live on the CPU only at this point */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!hipsparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: the solves need no row/column permutation */
      B->ops->solve          = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJHIPSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
    }
  }
  /* no GPU MatMatSolve available; clearing these makes callers fall back */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Process runtime options for SeqAIJHIPSPARSE matrices: GPU storage format, CPU-solve
   toggle, and hipSPARSE SpMV/SpMM algorithm selection. Factored matrices (factortype !=
   MAT_FACTOR_NONE) accept none of these options. */
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatHIPSPARSEStorageFormat format;
  PetscBool                 flg;
  Mat_SeqAIJHIPSPARSE      *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for MatMult only */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
    PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
    /* same positional-consistency check as for the SpMV table above */
    PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
    /*
    PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
    */
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Build (or refresh) the lower-triangular ILU factor on the GPU from the host SeqAIJ
   factor of A. The lower factor is treated as unit lower triangular: a 1.0 is appended on
   each diagonal and the strictly-lower entries of each row are copied from the host
   arrays. On the first call the CSR structure, hipSPARSE descriptor, and triangular-solve
   analysis are created; later calls re-copy only the numerical values to the device. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  const PetscInt                     *ai = a->i, *aj = a->j, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiLo, *AjLo;
  PetscInt                            i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the host data is newer than, or not yet on, the device */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of the strictly-lower part is empty, hence ai[n] - ai[1] off-diagonal entries plus n unit diagonals */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned (page-locked) host staging buffer for the values */
        PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1; /* write position in AjLo/AALo */
        rowOffset = 1; /* running CSR row pointer */
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));
          offset += nz;
          /* unit diagonal entry appended after the strictly-lower entries of row i */
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;
          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat                 = new CsrMatrix;
        loTriFactor->csrMat->num_rows       = n;
        loTriFactor->csrMat->num_cols       = n;
        loTriFactor->csrMat->num_entries    = nzLower;
        loTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->values         = new THRUSTARRAY(nzLower);

        /* thrust assign() copies the staged host arrays to the device */
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the values buffer (AA_h) for later value-only updates; index buffers are done */
        loTriFactor->AA_h = AALo;
        PetscCallHIP(hipHostFree(AiLo));
        PetscCallHIP(hipHostFree(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust/hipSPARSE errors typically derive from std::exception; this
         char* handler only matches the legacy throw-a-string convention — confirm */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Build (or refresh) the upper-triangular ILU factor on the GPU from the host SeqAIJ
   factor of A. For each row the inverted diagonal (1/v[nz]) is written first, followed by
   the strictly-upper entries, filling the CSR arrays from the back. On the first call the
   CSR structure, descriptor, and solve analysis are created; later calls refresh values
   only. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                         *a                   = (Mat_SeqAIJ *)A->data;
  PetscInt                            n                   = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors      *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor         = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  const PetscInt                     *aj = a->j, *adiag, *vi;
  const MatScalar                    *aa = a->a, *v;
  PetscInt                           *AiUp, *AjUp;
  PetscInt                            i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  /* only (re)build when the host data is newer than, or not yet on, the device */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* as used here, adiag[] decreases with i, so this difference counts all U entries */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned (page-locked) host staging buffer for the values */
        PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first and
           filling the CSR arrays from the back */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* host factor stores the diagonal; U gets its reciprocal */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat                 = new CsrMatrix;
        upTriFactor->csrMat->num_rows       = n;
        upTriFactor->csrMat->num_cols       = n;
        upTriFactor->csrMat->num_entries    = nzUpper;
        upTriFactor->csrMat->row_offsets    = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->values         = new THRUSTARRAY(nzUpper);
        /* thrust assign() copies the staged host arrays to the device */
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the values buffer (AA_h) for later value-only updates; index buffers are done */
        upTriFactor->AA_h = AAUp;
        PetscCallHIP(hipHostFree(AiUp));
        PetscCallHIP(hipHostFree(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): thrust/hipSPARSE errors typically derive from std::exception; this
         char* handler only matches the legacy throw-a-string convention — confirm */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Ensure both ILU triangular factors of A exist on the GPU, allocate the shared work
   vector, and cache the row/column permutations on the device when the factorization
   ordering is not the identity. Leaves the matrix flagged as current on both host and
   device. */
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscBool                      row_identity, col_identity;
  Mat_SeqAIJ                    *a                   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  IS                             isrow = a->row, iscol = a->icol;
  PetscInt                       n     = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
  PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));

  /* device work array of length n, allocated once; presumably used as the intermediate
     between the two triangular solves in the MatSolve kernels — defined later in this file */
  if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
  hipsparseTriFactors->nnz = a->nz;

  /* both host and device copies are now up to date */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !hipsparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !hipsparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
539: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
540: {
541: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
542: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
543: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
544: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
545: PetscInt *AiUp, *AjUp;
546: PetscScalar *AAUp;
547: PetscScalar *AALo;
548: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
549: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
550: const PetscInt *ai = b->i, *aj = b->j, *vj;
551: const MatScalar *aa = b->a, *v;
553: PetscFunctionBegin;
554: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
555: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
556: try {
557: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
558: PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
559: if (!upTriFactor && !loTriFactor) {
560: /* Allocate Space for the upper triangular matrix */
561: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
562: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
564: /* Fill the upper triangular matrix */
565: AiUp[0] = (PetscInt)0;
566: AiUp[n] = nzUpper;
567: offset = 0;
568: for (i = 0; i < n; i++) {
569: /* set the pointers */
570: v = aa + ai[i];
571: vj = aj + ai[i];
572: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
574: /* first, set the diagonal elements */
575: AjUp[offset] = (PetscInt)i;
576: AAUp[offset] = (MatScalar)1.0 / v[nz];
577: AiUp[i] = offset;
578: AALo[offset] = (MatScalar)1.0 / v[nz];
580: offset += 1;
581: if (nz > 0) {
582: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
583: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
584: for (j = offset; j < offset + nz; j++) {
585: AAUp[j] = -AAUp[j];
586: AALo[j] = AAUp[j] / v[nz];
587: }
588: offset += nz;
589: }
590: }
592: /* allocate space for the triangular factor information */
593: PetscCall(PetscNew(&upTriFactor));
594: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
596: /* Create the matrix description */
597: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
598: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
599: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
600: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
601: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
603: /* set the matrix */
604: upTriFactor->csrMat = new CsrMatrix;
605: upTriFactor->csrMat->num_rows = A->rmap->n;
606: upTriFactor->csrMat->num_cols = A->cmap->n;
607: upTriFactor->csrMat->num_entries = a->nz;
608: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
609: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
610: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
611: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
612: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
613: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
615: /* set the operation */
616: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
618: /* Create the solve analysis information */
619: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
620: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
621: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
622: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
623: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
625: /* perform the solve analysis */
626: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
627: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
629: PetscCallHIP(WaitForHIP());
630: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
632: /* assign the pointer */
633: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
635: /* allocate space for the triangular factor information */
636: PetscCall(PetscNew(&loTriFactor));
637: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
639: /* Create the matrix description */
640: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
641: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
642: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
643: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
644: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
646: /* set the operation */
647: loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
649: /* set the matrix */
650: loTriFactor->csrMat = new CsrMatrix;
651: loTriFactor->csrMat->num_rows = A->rmap->n;
652: loTriFactor->csrMat->num_cols = A->cmap->n;
653: loTriFactor->csrMat->num_entries = a->nz;
654: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
655: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
656: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
657: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
658: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
659: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
661: /* Create the solve analysis information */
662: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
663: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
664: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
665: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
666: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
668: /* perform the solve analysis */
669: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
670: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
672: PetscCallHIP(WaitForHIP());
673: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
675: /* assign the pointer */
676: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
678: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
679: PetscCallHIP(hipHostFree(AiUp));
680: PetscCallHIP(hipHostFree(AjUp));
681: } else {
682: /* Fill the upper triangular matrix */
683: offset = 0;
684: for (i = 0; i < n; i++) {
685: /* set the pointers */
686: v = aa + ai[i];
687: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
689: /* first, set the diagonal elements */
690: AAUp[offset] = 1.0 / v[nz];
691: AALo[offset] = 1.0 / v[nz];
693: offset += 1;
694: if (nz > 0) {
695: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
696: for (j = offset; j < offset + nz; j++) {
697: AAUp[j] = -AAUp[j];
698: AALo[j] = AAUp[j] / v[nz];
699: }
700: offset += nz;
701: }
702: }
703: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
704: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
705: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
706: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
707: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
708: }
709: PetscCallHIP(hipHostFree(AAUp));
710: PetscCallHIP(hipHostFree(AALo));
711: } catch (char *ex) {
712: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
713: }
714: }
715: PetscFunctionReturn(PETSC_SUCCESS);
716: }
718: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
719: {
720: PetscBool perm_identity;
721: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
722: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
723: IS ip = a->row;
724: PetscInt n = A->rmap->n;
726: PetscFunctionBegin;
727: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
728: PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
729: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
730: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
732: A->offloadmask = PETSC_OFFLOAD_BOTH;
733: /* lower triangular indices */
734: PetscCall(ISIdentity(ip, &perm_identity));
735: if (!perm_identity) {
736: IS iip;
737: const PetscInt *irip, *rip;
739: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
740: PetscCall(ISGetIndices(iip, &irip));
741: PetscCall(ISGetIndices(ip, &rip));
742: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
743: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
744: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
745: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
746: PetscCall(ISRestoreIndices(iip, &irip));
747: PetscCall(ISDestroy(&iip));
748: PetscCall(ISRestoreIndices(ip, &rip));
749: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
750: }
751: PetscFunctionReturn(PETSC_SUCCESS);
752: }
754: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
755: {
756: PetscBool perm_identity;
757: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
758: IS ip = b->row;
760: PetscFunctionBegin;
761: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
762: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
763: B->offloadmask = PETSC_OFFLOAD_CPU;
764: /* determine which version of MatSolve needs to be used. */
765: PetscCall(ISIdentity(ip, &perm_identity));
766: if (perm_identity) {
767: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
768: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
769: B->ops->matsolve = NULL;
770: B->ops->matsolvetranspose = NULL;
771: } else {
772: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
773: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
774: B->ops->matsolve = NULL;
775: B->ops->matsolvetranspose = NULL;
776: }
778: /* get the triangular factors */
779: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
780: PetscFunctionReturn(PETSC_SUCCESS);
781: }
783: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
784: {
785: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
786: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
787: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
788: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
789: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
790: hipsparseIndexBase_t indexBase;
791: hipsparseMatrixType_t matrixType;
792: hipsparseFillMode_t fillMode;
793: hipsparseDiagType_t diagType;
795: PetscFunctionBegin;
796: /* allocate space for the transpose of the lower triangular factor */
797: PetscCall(PetscNew(&loTriFactorT));
798: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
800: /* set the matrix descriptors of the lower triangular factor */
801: matrixType = hipsparseGetMatType(loTriFactor->descr);
802: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
803: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
804: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
806: /* Create the matrix description */
807: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
808: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
809: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
810: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
811: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
813: /* set the operation */
814: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
816: /* allocate GPU space for the CSC of the lower triangular factor*/
817: loTriFactorT->csrMat = new CsrMatrix;
818: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
819: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
820: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
821: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
822: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
823: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
825: /* compute the transpose of the lower triangular factor, i.e. the CSC */
826: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
827: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
828: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
829: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
830: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
831: PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
832: #endif
833: */
834: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
836: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
837: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
838: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
839: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
840: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
841: #else
842: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
843: #endif
845: PetscCallHIP(WaitForHIP());
846: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
848: /* Create the solve analysis information */
849: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
850: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
851: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
852: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
853: PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
855: /* perform the solve analysis */
856: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
857: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
859: PetscCallHIP(WaitForHIP());
860: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
862: /* assign the pointer */
863: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
865: /*********************************************/
866: /* Now the Transpose of the Upper Tri Factor */
867: /*********************************************/
869: /* allocate space for the transpose of the upper triangular factor */
870: PetscCall(PetscNew(&upTriFactorT));
871: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
873: /* set the matrix descriptors of the upper triangular factor */
874: matrixType = hipsparseGetMatType(upTriFactor->descr);
875: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
876: fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
877: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
879: /* Create the matrix description */
880: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
881: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
882: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
883: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
884: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
886: /* set the operation */
887: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
889: /* allocate GPU space for the CSC of the upper triangular factor*/
890: upTriFactorT->csrMat = new CsrMatrix;
891: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
892: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
893: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
894: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
895: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
896: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
898: /* compute the transpose of the upper triangular factor, i.e. the CSC */
899: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
900: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
901: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
902: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
903: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
904: PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
905: #endif
906: */
907: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
908: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
909: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
910: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
911: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
912: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
913: #else
914: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
915: #endif
917: PetscCallHIP(WaitForHIP());
918: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
920: /* Create the solve analysis information */
921: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
922: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
923: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
924: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
925: PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
927: /* perform the solve analysis */
928: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
929: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
931: PetscCallHIP(WaitForHIP());
932: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
934: /* assign the pointer */
935: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
936: PetscFunctionReturn(PETSC_SUCCESS);
937: }
939: struct PetscScalarToPetscInt {
940: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
941: };
/* Build (or refresh) the explicit transpose of A on the GPU, cached in
   hipsparsestruct->matTranspose. For CSR storage the column-permutation that maps A's
   values to A^T's values is computed once (csr2csc_i) so later updates are a single
   thrust::copy through a permutation iterator; for ELL/HYB storage the transpose is
   rebuilt via HYB->CSR->CSC->HYB conversions. A->transupdated is set on success. */
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* nothing to do if the cached transpose is already current */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats cannot be updated in place: throw away any stale transpose first */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants used by SpMV with A^T */
    PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* allocate the CSR arrays of A^T (dimensions swapped, same nonzero count) */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* 32-bit copy of A's row offsets, needed by csr2csc below */
      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
      /* ELL uses a fixed-width (max) partition; HYB lets the library decide */
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries: the HYB matrix owns its own storage */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* trick: run csr2csc on the entry indices 0..nz-1 (stored as scalars) so that the
         transposed "values" are exactly the permutation mapping A's values to A^T's */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the scalar-encoded permutation to integer indices and cache it */
      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
    }
    /* gather A's values through the cached permutation into A^T's value array */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1085: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1086: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1087: {
1088: PetscInt n = xx->map->n;
1089: const PetscScalar *barray;
1090: PetscScalar *xarray;
1091: thrust::device_ptr<const PetscScalar> bGPU;
1092: thrust::device_ptr<PetscScalar> xGPU;
1093: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1094: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1095: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1096: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1098: PetscFunctionBegin;
1099: /* Analyze the matrix and create the transpose ... on the fly */
1100: if (!loTriFactorT && !upTriFactorT) {
1101: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1102: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1103: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1104: }
1106: /* Get the GPU pointers */
1107: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1108: PetscCall(VecHIPGetArrayRead(bb, &barray));
1109: xGPU = thrust::device_pointer_cast(xarray);
1110: bGPU = thrust::device_pointer_cast(barray);
1112: PetscCall(PetscLogGpuTimeBegin());
1113: /* First, reorder with the row permutation */
1114: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);
1116: /* First, solve U */
1117: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1118: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1120: /* Then, solve L */
1121: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1122: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1124: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1125: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());
1127: /* Copy the temporary to the full solution. */
1128: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);
1130: /* restore */
1131: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1132: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1133: PetscCall(PetscLogGpuTimeEnd());
1134: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1135: PetscFunctionReturn(PETSC_SUCCESS);
1136: }
/* MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering - solve A^T x = b on the GPU where A = L*U
   was factored with the natural (identity) ordering, so no row/column permutations are applied.

   Input:  A  - the factored matrix (factors live in A->spptr)
           bb - right-hand side vector
   Output: xx - solution vector

   Since (L*U)^T = U^T * L^T, the transposed upper factor is applied first, then the transposed
   lower factor. The transposed-factor structures are built lazily on the first transpose solve.
*/
1138: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1139: {
1140: const PetscScalar *barray;
1141: PetscScalar *xarray;
1142: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1143: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1144: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1145: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1147: PetscFunctionBegin;
1148: /* Analyze the matrix and create the transpose ... on the fly */
/* Both transpose factors are created together by the analysis call, so checking both for NULL
   (rather than either) matches how they are populated below. */
1149: if (!loTriFactorT && !upTriFactorT) {
1150: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1151: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1152: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1153: }
1155: /* Get the GPU pointers */
1156: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1157: PetscCall(VecHIPGetArrayRead(bb, &barray));
1159: PetscCall(PetscLogGpuTimeBegin());
1160: /* First, solve U */
/* U^T y = b, with y stored in the shared work vector */
1161: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1162: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1164: /* Then, solve L */
/* L^T x = y, writing the final solution directly into xx's device array */
1165: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1166: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1168: /* restore */
1169: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1170: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1171: PetscCall(PetscLogGpuTimeEnd());
/* Triangular solves cost one multiply-add per stored nonzero, minus the n unit-diagonal entries */
1172: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1173: PetscFunctionReturn(PETSC_SUCCESS);
1174: }
/* MatSolve_SeqAIJHIPSPARSE - solve A x = b on the GPU with A = L*U factored under a
   row/column reordering.

   Input:  A  - the factored matrix (factors and permutation indices live in A->spptr)
           bb - right-hand side vector
   Output: xx - solution vector

   Pipeline: permute b by the row permutation into the work vector, forward-solve L,
   back-solve U, then scatter through the column permutation into x.
*/
1176: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1177: {
1178: const PetscScalar *barray;
1179: PetscScalar *xarray;
1180: thrust::device_ptr<const PetscScalar> bGPU;
1181: thrust::device_ptr<PetscScalar> xGPU;
1182: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1183: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1184: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1185: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1187: PetscFunctionBegin;
1188: /* Get the GPU pointers */
1189: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1190: PetscCall(VecHIPGetArrayRead(bb, &barray));
/* Wrap the raw device pointers so they can feed thrust iterators below */
1191: xGPU = thrust::device_pointer_cast(xarray);
1192: bGPU = thrust::device_pointer_cast(barray);
1194: PetscCall(PetscLogGpuTimeBegin());
1195: /* First, reorder with the row permutation */
/* tempGPU[i] = b[rperm[i]]; the end iterator is defined by rpermIndices->end() */
1196: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1198: /* Next, solve L */
/* L y = P_r b; result goes into xarray, reused as scratch for the U solve */
1199: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1200: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1202: /* Then, solve U */
/* U z = y; z lands back in the work vector so the final permutation can read from it */
1203: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1204: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1206: /* Last, reorder with the column permutation */
/* x[i] = temp[cperm[i]] */
1207: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1209: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1210: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1211: PetscCall(PetscLogGpuTimeEnd());
/* One multiply-add per stored nonzero, minus the n unit-diagonal entries */
1212: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1213: PetscFunctionReturn(PETSC_SUCCESS);
1214: }
/* MatSolve_SeqAIJHIPSPARSE_NaturalOrdering - solve A x = b on the GPU where A = L*U was
   factored with the natural (identity) ordering.

   Input:  A  - the factored matrix (factors live in A->spptr)
           bb - right-hand side vector
   Output: xx - solution vector

   Without permutations the pipeline is just a forward solve with L (b -> work vector)
   followed by a back solve with U (work vector -> x).
*/
1216: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1217: {
1218: const PetscScalar *barray;
1219: PetscScalar *xarray;
1220: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1221: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1222: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1223: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1225: PetscFunctionBegin;
1226: /* Get the GPU pointers */
1227: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1228: PetscCall(VecHIPGetArrayRead(bb, &barray));
1230: PetscCall(PetscLogGpuTimeBegin());
1231: /* First, solve L */
/* L y = b, with y stored in the shared work vector */
1232: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1233: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1235: /* Next, solve U */
/* U x = y, writing the final solution directly into xx's device array */
1236: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1237: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1239: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1240: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1241: PetscCall(PetscLogGpuTimeEnd());
/* One multiply-add per stored nonzero, minus the n unit-diagonal entries */
1242: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1243: PetscFunctionReturn(PETSC_SUCCESS);
1244: }
1246: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1247: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
/* MatSolve_SeqAIJHIPSPARSE_ILU0 - solve with ILU(0) factors via hipSPARSE's generic SpSV API.

   Input:  fact - the ILU(0)-factored matrix (descriptors and buffers live in fact->spptr)
           b    - right-hand side vector
   Output: x    - solution vector

   The dense-vector descriptors dnVecDescr_X/dnVecDescr_Y are reused across calls; only their
   data pointers are swapped with hipsparseDnVecSetValues() before each triangular solve.
   The #if branches exist because ROCm 5.6/6.0+ dropped the explicit buffer argument from
   hipsparseSpSV_solve(); older versions must pass the analysis-time buffer.
*/
1248: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1249: {
1250: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1251: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1252: const PetscScalar *barray;
1253: PetscScalar *xarray;
1255: PetscFunctionBegin;
1256: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1257: PetscCall(VecHIPGetArrayRead(b, &barray));
1258: PetscCall(PetscLogGpuTimeBegin());
1260: /* Solve L*y = b */
/* Point X at b and Y at the internal work array fs->Y */
1261: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1262: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1263: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1264: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1265: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1266: #else
1267: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1268: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1269: #endif
1270: /* Solve U*x = y */
/* Re-point X at the output array; Y still holds the intermediate result */
1271: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1272: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1273: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1274: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1275: #else
1276: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1277: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1278: #endif
1279: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1280: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1282: PetscCall(PetscLogGpuTimeEnd());
/* One multiply-add per stored nonzero, minus the n unit-diagonal entries of L */
1283: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1284: PetscFunctionReturn(PETSC_SUCCESS);
1285: }
/* MatSolveTranspose_SeqAIJHIPSPARSE_ILU0 - transpose solve with ILU(0) factors via SpSV.

   Input:  fact - the ILU(0)-factored matrix (descriptors and buffers live in fact->spptr)
           b    - right-hand side vector
   Output: x    - solution vector

   (L*U)^T x = b is solved as U^T y = b followed by L^T x = y, using HIPSPARSE_OPERATION_TRANSPOSE
   on the same L/U matrix descriptors. The transpose-specific SpSV descriptors and buffers are
   created lazily on the first call (createdTransposeSpSVDescr) and the numeric analysis is redone
   whenever the factor values change (updatedTransposeSpSVAnalysis, reset by the numeric factorization).
*/
1287: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1288: {
1289: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1290: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1291: const PetscScalar *barray;
1292: PetscScalar *xarray;
1294: PetscFunctionBegin;
1295: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1296: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1297: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1298: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1300: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1301: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1302: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1303: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1304: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1305: }
/* SpSV analysis is numeric: redo it if the factor values changed since the last transpose solve */
1307: if (!fs->updatedTransposeSpSVAnalysis) {
1308: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1310: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1311: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1312: }
1314: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1315: PetscCall(VecHIPGetArrayRead(b, &barray));
1316: PetscCall(PetscLogGpuTimeBegin());
1318: /* Solve Ut*y = b */
/* Point X at b and Y at the internal work array fs->Y */
1319: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1320: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1321: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1322: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1323: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1324: #else
1325: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1326: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1327: #endif
1328: /* Solve Lt*x = y */
/* Re-point X at the output array; Y still holds the intermediate result */
1329: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1330: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1331: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1332: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1333: #else
1334: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1335: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1336: #endif
1337: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1338: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1339: PetscCall(PetscLogGpuTimeEnd());
/* One multiply-add per stored nonzero, minus the n unit-diagonal entries of L */
1340: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1341: PetscFunctionReturn(PETSC_SUCCESS);
1342: }
/* MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0 - numeric ILU(0) factorization, performed in place on the GPU.

   Input:  A    - the matrix to factor (must be MATSEQAIJHIPSPARSE)
           info - factorization options (unused here; pattern was fixed in the symbolic phase)
   Output: fact - receives the L/U factors in its preallocated CSR value array

   Copies A's values into fact's device CSR array, runs hipsparseXcsrilu02 on it in place,
   then redoes the (value-dependent) SpSV analysis for L and U and installs the solve callbacks.
*/
1344: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1345: {
1346: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1347: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1348: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1349: CsrMatrix *Acsr;
1350: PetscInt m, nz;
1351: PetscBool flg;
1353: PetscFunctionBegin;
1354: if (PetscDefined(USE_DEBUG)) {
1355: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1356: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1357: }
1359: /* Copy A's value to fact */
1360: m = fact->rmap->n;
1361: nz = aij->nz;
/* Ensure A's CSR data is current on the device before the device-to-device copy */
1362: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1363: Acsr = (CsrMatrix *)Acusp->mat->mat;
1364: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1366: /* Factorize fact inplace */
1367: if (m)
1368: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1369: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1370: if (PetscDefined(USE_DEBUG)) {
1371: int numerical_zero;
1372: hipsparseStatus_t status;
1373: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1374: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1375: }
1377: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1378: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1380: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1382: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1383: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1385: fact->offloadmask = PETSC_OFFLOAD_GPU;
1386: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
1387: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
/* No GPU multiple-RHS solve is provided for this factorization */
1388: fact->ops->matsolve = NULL;
1389: fact->ops->matsolvetranspose = NULL;
/* Flop count was estimated during the symbolic phase */
1390: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1391: PetscFunctionReturn(PETSC_SUCCESS);
1392: }
/* MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0 - set up an in-place ILU(0) factorization on the GPU.

   Input:  A           - the matrix to factor (must be MATSEQAIJHIPSPARSE, square, dense diagonal)
           isrow/iscol - orderings (not used: ILU(0) here works in the natural ordering)
           info        - factorization options (only info->fill is recorded)
   Output: fact        - the factor matrix, prepared for MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0

   Since ILU(0) keeps A's nonzero pattern, this "symbolic" phase mirrors A's CSR structure into
   fact's device arrays, creates the hipSPARSE descriptors for M (the packed L+U factor), L and U,
   queries and allocates the csrilu02/SpSV work buffers, runs the structural analysis, and
   estimates the flops of the upcoming numeric factorization.

   Review fixes vs. previous revision (behavior unchanged):
   - removed a dead store in the flop-estimate loop: nzLeft was assigned Adiag[i] - Ai[i] and then
     immediately overwritten by the (nzRow - 1) / 2 approximation before any use;
   - renamed the host row-pointer in the flop-estimate scope to Ai_h so it no longer shadows the
     device pointer Ai declared earlier in this function.
*/
1394: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1395: {
1396: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1397: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1398: PetscInt m, nz;
1400: PetscFunctionBegin;
1401: if (PetscDefined(USE_DEBUG)) {
1402: PetscBool flg, diagDense;
1404: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1405: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1406: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1407: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1408: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1409: }
1411: /* Free the old stale stuff */
1412: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1414: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1415: but they will not be used. Allocate them just for easy debugging.
1416: */
1417: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1419: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1420: fact->factortype = MAT_FACTOR_ILU;
1421: fact->info.factor_mallocs = 0;
1422: fact->info.fill_ratio_given = info->fill;
/* ILU(0) adds no fill beyond A's pattern */
1423: fact->info.fill_ratio_needed = 1.0;
1425: aij->row = NULL;
1426: aij->col = NULL;
1428: /* ====================================================================== */
1429: /* Copy A's i, j to fact and also allocate the value array of fact. */
1430: /* We'll do in-place factorization on fact */
1431: /* ====================================================================== */
1432: const int *Ai, *Aj;
1434: m = fact->rmap->n;
1435: nz = aij->nz;
1437: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1438: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1439: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1440: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1441: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1442: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1444: /* ====================================================================== */
1445: /* Create descriptors for M, L, U */
1446: /* ====================================================================== */
1447: hipsparseFillMode_t fillMode;
1448: hipsparseDiagType_t diagType;
1450: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1451: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1452: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1454: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1455: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1456: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1457: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1458: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1459: */
/* L and U share M's CSR arrays; the fill-mode/diag-type attributes select which triangle each views */
1460: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1461: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1462: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1463: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1464: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1466: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1467: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1468: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1469: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1470: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1472: /* ========================================================================= */
1473: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1474: /* ========================================================================= */
1475: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1476: if (m)
1477: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1478: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1480: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1481: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1483: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1484: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1486: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1487: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1489: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1490: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1492: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1493: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1494: */
1495: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1496: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1497: fs->spsvBuffer_L = fs->factBuffer_M;
1498: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1499: } else {
1500: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1501: fs->spsvBuffer_U = fs->factBuffer_M;
1502: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1503: }
1505: /* ========================================================================== */
1506: /* Perform analysis of ilu0 on M, SpSv on L and U */
1507: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1508: /* ========================================================================== */
1509: int structural_zero;
1511: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1512: if (m)
1513: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1514: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1515: if (PetscDefined(USE_DEBUG)) {
1516: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1517: hipsparseStatus_t status;
1518: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1519: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1520: }
1522: /* Estimate FLOPs of the numeric factorization */
1523: {
1524: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1525: PetscInt *Ai_h, nzRow, nzLeft;
1526: PetscLogDouble flops = 0.0;
1527: const PetscInt *Adiag;
1529: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &Adiag, NULL));
/* Host copy of A's row offsets (renamed from Ai to avoid shadowing the device pointer above) */
1530: Ai_h = Aseq->i;
1531: for (PetscInt i = 0; i < m; i++) {
1532: if (Ai_h[i] < Adiag[i] && Adiag[i] < Ai_h[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1533: nzRow = Ai_h[i + 1] - Ai_h[i];
1535: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1536: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1537: Approximate nzLeft by half the row's off-diagonal count rather than the exact Adiag[i] - Ai_h[i]
1538: (the exact value was previously computed and then immediately overwritten -- a dead store). */
1538: nzLeft = (nzRow - 1) / 2;
1539: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1540: }
1541: }
1542: fs->numericFactFlops = flops;
1543: }
1544: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1545: PetscFunctionReturn(PETSC_SUCCESS);
1546: }
/* MatSolve_SeqAIJHIPSPARSE_ICC0 - solve with IC(0) factors (A = L*L^T) via hipSPARSE SpSV.

   Input:  fact - the IC(0)-factored matrix (descriptors and buffers live in fact->spptr)
           b    - right-hand side vector
   Output: x    - solution vector

   Only the L descriptor exists; the second solve uses HIPSPARSE_OPERATION_TRANSPOSE on the
   same L to apply L^T. The #if branches exist because ROCm 5.6/6.0+ dropped the explicit
   buffer argument from hipsparseSpSV_solve().
*/
1548: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1549: {
1550: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1551: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1552: const PetscScalar *barray;
1553: PetscScalar *xarray;
1555: PetscFunctionBegin;
1556: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1557: PetscCall(VecHIPGetArrayRead(b, &barray));
1558: PetscCall(PetscLogGpuTimeBegin());
1560: /* Solve L*y = b */
/* Point X at b and Y at the internal work array fs->Y */
1561: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1562: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1563: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1564: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1565: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1566: #else
1567: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1568: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1569: #endif
1570: /* Solve Lt*x = y */
/* Re-point X at the output array; Y still holds the intermediate result */
1571: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1572: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1573: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1574: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1575: #else
1576: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1577: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1578: #endif
1579: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1580: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1582: PetscCall(PetscLogGpuTimeEnd());
/* One multiply-add per stored nonzero, minus the n diagonal divisions counted once */
1583: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1584: PetscFunctionReturn(PETSC_SUCCESS);
1585: }
/* MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0 - numeric IC(0) (incomplete Cholesky) factorization,
   performed in place on the GPU.

   Input:  A    - the matrix to factor (must be MATSEQAIJHIPSPARSE)
           info - factorization options (unused here; pattern was fixed in the symbolic phase)
   Output: fact - receives the L factor in its preallocated CSR value array

   Copies A's values into fact's device CSR array, runs hipsparseXcsric02 on it in place, then
   redoes the (value-dependent) SpSV analysis for L and L^T and installs the solve callback.
*/
1587: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1588: {
1589: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1590: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1591: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1592: CsrMatrix *Acsr;
1593: PetscInt m, nz;
1594: PetscBool flg;
1596: PetscFunctionBegin;
1597: if (PetscDefined(USE_DEBUG)) {
1598: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1599: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1600: }
1602: /* Copy A's value to fact */
1603: m = fact->rmap->n;
1604: nz = aij->nz;
/* Ensure A's CSR data is current on the device before the device-to-device copy */
1605: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1606: Acsr = (CsrMatrix *)Acusp->mat->mat;
1607: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1609: /* Factorize fact inplace */
1610: /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1611: The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1612: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1613: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1614: */
/* m = 0 is skipped for the same reason as csrilu02: the routine errors out on empty matrices */
1615: if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1616: if (PetscDefined(USE_DEBUG)) {
1617: int numerical_zero;
1618: hipsparseStatus_t status;
1619: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1620: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1621: }
/* SpSV analysis is numeric (needs valid values), so it runs after the factorization */
1623: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1625: /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1626: ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1627: */
1628: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1630: fact->offloadmask = PETSC_OFFLOAD_GPU;
1631: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
/* The factorization is L*L^T, so the transpose solve is the same operation as the solve */
1632: fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0;
/* No GPU multiple-RHS solve is provided for this factorization */
1633: fact->ops->matsolve = NULL;
1634: fact->ops->matsolvetranspose = NULL;
/* Flop count was estimated during the symbolic phase */
1635: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1636: PetscFunctionReturn(PETSC_SUCCESS);
1637: }
/*
  Symbolic phase of zero-fill incomplete Cholesky (ICC(0)) on the GPU via hipSPARSE csric02.

  Because ICC(0) introduces no fill, the factor has exactly A's sparsity pattern, so this
  routine only (1) copies A's row pointers/column indices to `fact`'s device arrays,
  (2) creates the hipSPARSE matrix/vector/solve descriptors, (3) sizes and allocates the
  factorization and triangular-solve buffers, and (4) runs the csric02 analysis.  The
  values are filled in later by MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0 (installed at
  the end), which factors in place in fs->csrVal.

  Input Parameters:
+ fact - the factor matrix (MATSEQAIJHIPSPARSE), already created by MatGetFactor()
. A    - the matrix to be factored; must be square with a fully present diagonal
. perm - row/column permutation (must be the identity for this fast path; checked by the caller)
- info - factorization options (only info->fill is recorded, as ratio_given)
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                       m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    /* csric02 requires a square matrix whose diagonal entries are all present in the pattern */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): factor pattern == A's pattern, so no extra fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
  PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  hipsparseFillMode_t fillMode;
  hipsparseDiagType_t diagType;

  PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
  PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
  PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
    hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = HIPSPARSE_FILL_MODE_LOWER;
  diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares (aliases) fact's device CSR arrays; after the numeric phase, csrVal holds the factor */
  PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
  /* hipSPARSE errors on empty matrices, hence the m != 0 guards */
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));

  /* Solves with L (forward) and L^T (backward) each need their own SpSV descriptor/buffer */
  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    /* The larger of the two SpSV buffers is the one aliased with the factorization buffer */
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int structural_zero;

  fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    hipsparseStatus_t status;
    /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft; /* note: this Ai (host row pointers) shadows the device Ai above */
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1779: #endif
/*
  Symbolic ILU factorization dispatcher.

  With a new-enough HIP (>= 4.5.0), ILU(0) with identity row/column orderings and
  factorization not forced onto the host is routed to the hipSPARSE csrilu0-based
  fast path (MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0).  Every other case falls
  back to the host SeqAIJ symbolic phase; only the numeric phase then runs on the
  GPU (MatLUFactorNumeric_SeqAIJHIPSPARSE).
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* Only test the orderings when the GPU path is even allowed */
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  /* ILU0 fast path: zero levels of fill and no row/column permutation */
  if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
  else
#endif
  {
    /* Host fallback: drop any stale device factors, then do the CPU symbolic phase */
    PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1803: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1804: {
1805: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1807: PetscFunctionBegin;
1808: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1809: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1810: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1811: PetscFunctionReturn(PETSC_SUCCESS);
1812: }
/*
  Symbolic ICC factorization dispatcher.

  With HIP >= 4.5.0, ICC(0) with an identity permutation and factorization not
  forced onto the host is routed to the hipSPARSE csric02-based fast path
  (MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0).  Every other case falls back to
  the host SeqAIJ symbolic phase, with the numeric phase on the GPU.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  PetscBool perm_identity = PETSC_FALSE;
  /* Only test the ordering when the GPU path is even allowed */
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
  /* ICC0 fast path: zero levels of fill and natural ordering */
  if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
  else
#endif
  {
    /* Host fallback: drop any stale device factors, then do the CPU symbolic phase */
    PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1833: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1834: {
1835: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1837: PetscFunctionBegin;
1838: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1839: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1840: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1841: PetscFunctionReturn(PETSC_SUCCESS);
1842: }
/* Query callback composed on factor matrices: reports that this factorization
   is provided by the "hipsparse" solver package (MATSOLVERHIPSPARSE). */
static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERHIPSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1851: /*MC
1852: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1853: on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
hipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1857: algorithms are not recommended. This class does NOT support direct solver operations.
1859: Level: beginner
1861: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1862: M*/
/*
  MatGetFactor() implementation for MATSOLVERHIPSPARSE: creates an (uninitialized)
  factor matrix B of type MATSEQAIJHIPSPARSE and installs the symbolic-factorization
  function pointers for the requested factor type.

  If A is bound to the CPU, the plain SeqAIJ symbolic routines are installed instead,
  so the whole factorization stays on the host.  Supported factor types are
  LU/ILU/ILUDT and Cholesky/ICC; anything else errors out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n; /* factorization requires a square matrix */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; /* set before MatSetType() so type construction sees it */
  PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      /* Host-bound matrix: keep the entire factorization on the CPU */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");

  /* The symbolic phase allocates B's storage, so skip preallocation here */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Copy the matrix values from the device to the host when the device copy is
  newer (offloadmask == PETSC_OFFLOAD_GPU).  Only values are copied; the
  sparsity pattern (i, j) is assumed unchanged.

  For unfactored matrices the values come from the CSR multiply structure; for
  factored matrices (HIP >= 4.5.0) they come from the triangular-factor storage
  fs->csrVal.  Note that `cusp` and `fs` are two views of the same A->spptr --
  which one is valid depends on A->factortype.
*/
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
  Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
    /* Host and device now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1936: {
1937: PetscFunctionBegin;
1938: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1939: *array = ((Mat_SeqAIJ *)A->data)->a;
1940: PetscFunctionReturn(PETSC_SUCCESS);
1941: }
1943: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1944: {
1945: PetscFunctionBegin;
1946: A->offloadmask = PETSC_OFFLOAD_CPU;
1947: *array = NULL;
1948: PetscFunctionReturn(PETSC_SUCCESS);
1949: }
1951: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1952: {
1953: PetscFunctionBegin;
1954: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1955: *array = ((Mat_SeqAIJ *)A->data)->a;
1956: PetscFunctionReturn(PETSC_SUCCESS);
1957: }
/* End read-only access to the host value array.  Nothing was modified, so the
   offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1966: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1967: {
1968: PetscFunctionBegin;
1969: *array = ((Mat_SeqAIJ *)A->data)->a;
1970: PetscFunctionReturn(PETSC_SUCCESS);
1971: }
1973: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1974: {
1975: PetscFunctionBegin;
1976: A->offloadmask = PETSC_OFFLOAD_CPU;
1977: *array = NULL;
1978: PetscFunctionReturn(PETSC_SUCCESS);
1979: }
1981: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1982: {
1983: Mat_SeqAIJHIPSPARSE *cusp;
1984: CsrMatrix *matrix;
1986: PetscFunctionBegin;
1987: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1988: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1989: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1990: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1991: matrix = (CsrMatrix *)cusp->mat->mat;
1993: if (i) {
1994: #if !defined(PETSC_USE_64BIT_INDICES)
1995: *i = matrix->row_offsets->data().get();
1996: #else
1997: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
1998: #endif
1999: }
2000: if (j) {
2001: #if !defined(PETSC_USE_64BIT_INDICES)
2002: *j = matrix->column_indices->data().get();
2003: #else
2004: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2005: #endif
2006: }
2007: if (a) *a = matrix->values->data().get();
2008: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2009: PetscFunctionReturn(PETSC_SUCCESS);
2010: }
/*
  Copy (or build) the device representation of a SeqAIJHIPSPARSE matrix from the
  host CSR data, if the host copy is newer (offloadmask UNALLOCATED or CPU).

  Two paths:
  - Same nonzero pattern as last time (A->nonzerostate unchanged) and CSR format:
    only the values are copied to the existing device structure.
  - Otherwise: the old device structure (multiply struct, transpose, work vector,
    row offsets) is destroyed and rebuilt from scratch, honoring the requested
    storage format (CSR, or ELL/HYB via hipsparse_csr2hyb) and A's compressed-row
    mode (only rows with nonzeros are represented; ridx maps them back).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct       = hipsparsestruct->mat;
  Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
  PetscBool                      both            = PETSC_TRUE; /* set to FALSE when host values are absent, so we can't claim OFFLOAD_BOTH */
  PetscInt                       m = A->rmap->n, *ii, *ridx, tmp;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)hipsparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern unchanged, so keep the transpose structure but invalidate its values */
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern (or format) changed: tear down the entire device structure */
      PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
      PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete hipsparsestruct->workVector;
      delete hipsparsestruct->rowoffsets_gpu;
      hipsparsestruct->workVector     = NULL;
      hipsparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* Compressed-row storage: only the rows that contain nonzeros */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values yet: copy only the pattern and don't mark OFFLOAD_BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create hipsparse matrix */
        hipsparsestruct->nrows = m;
        matstruct              = new Mat_SeqAIJHIPSPARSEMultStruct;
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1), used with HIPSPARSE_POINTER_MODE_DEVICE */
        PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
        PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values         = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
          if (mat->num_rows) { /* hipsparse errors on empty matrices! */
            PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                                  HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
          }
        } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
          /* Build a temporary device CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->values         = new THRUSTARRAY(nnz);
          mat->row_offsets->assign(ii, ii + m + 1);
          mat->column_indices->assign(a->j, a->j + nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          hipsparseHybMat_t hybMat;
          PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
          /* ELL uses a fixed (max) width partition; HYB lets hipSPARSE choose */
          hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
          PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          hipsparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices     = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          hipsparsestruct->workVector = NULL;
          matstruct->cprowIndices     = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        hipsparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
      }
      PetscCallHIP(WaitForHIP());
      PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
      hipsparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2149: struct VecHIPPlusEquals {
2150: template <typename Tuple>
2151: __host__ __device__ void operator()(Tuple t)
2152: {
2153: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2154: }
2155: };
2157: struct VecHIPEquals {
2158: template <typename Tuple>
2159: __host__ __device__ void operator()(Tuple t)
2160: {
2161: thrust::get<1>(t) = thrust::get<0>(t);
2162: }
2163: };
2165: struct VecHIPEqualsReverse {
2166: template <typename Tuple>
2167: __host__ __device__ void operator()(Tuple t)
2168: {
2169: thrust::get<0>(t) = thrust::get<1>(t);
2170: }
2171: };
/* Per-product context stored in C->product->data for MatProducts where A is
   MATSEQAIJHIPSPARSE (sparse-dense via hipsparseSpMM, sparse-sparse via
   hipsparseSpGEMM).  Freed by MatProductCtxDestroy_MatMatHipsparse(). */
struct MatProductCtx_MatMatHipsparse {
  PetscBool    cisdense; /* C was host MATSEQDENSE, so convert it back after each numeric phase */
  PetscScalar *Bt;       /* device buffer; presumably an explicit transpose of B for paths not shown in this chunk -- confirm */
  Mat          X;        /* dense intermediate holding A*B (resp. op(A) B) for the PtAP/RARt products */
  PetscBool    reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;                /* used by the sparse-sparse (SpGEMM) path; usage not visible in this chunk */
  hipsparseSpMatDescr_t matSpBDescr;  /* sparse descriptor for B (SpGEMM path) */
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  hipsparseDnMatDescr_t matBDescr;    /* dense descriptor for B (SpMM path) */
  hipsparseDnMatDescr_t matCDescr;    /* dense descriptor for C, or for X in PtAP/RARt */
  PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  void *dBuffer4, *dBuffer5;
#endif
  size_t mmBufferSize;
  void  *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  hipsparseSpGEMMDescr_t spgemmDesc;
};
2193: static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
2194: {
2195: MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
2197: PetscFunctionBegin;
2198: PetscCallHIP(hipFree(mmdata->Bt));
2199: delete mmdata->Bcsr;
2200: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2201: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2202: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2203: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2204: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2205: PetscCallHIP(hipFree(mmdata->dBuffer4));
2206: PetscCallHIP(hipFree(mmdata->dBuffer5));
2207: #endif
2208: PetscCallHIP(hipFree(mmdata->mmBuffer));
2209: PetscCallHIP(hipFree(mmdata->mmBuffer2));
2210: PetscCall(MatDestroy(&mmdata->X));
2211: PetscCall(PetscFree(*(void **)data));
2212: PetscFunctionReturn(PETSC_SUCCESS);
2213: }
/*
  Numeric phase of C = op(A) op(B) where A is MATSEQAIJHIPSPARSE and B is dense,
  computed with hipsparseSpMM.

  Product types AB/AtB/ABt compute C directly; PtAP and RARt first compute the
  dense intermediate X = A*B (resp. A*Bt) with SpMM, then finish with a
  dense-dense multiply of B (or Bt) against X on the GPU.  Dense matrix/CSR
  descriptors and the SpMM work buffer are cached in the product context and
  rebuilt only when a leading dimension changes.
*/
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  PetscInt                       m, n, blda, clda;
  PetscBool                      flg, biship;
  Mat_SeqAIJHIPSPARSE           *cusp;
  hipsparseOperation_t           opA;
  const PetscScalar             *barray;
  PetscScalar                   *carray;
  MatProductCtx_MatMatHipsparse *mmdata;
  Mat_SeqAIJHIPSPARSEMultStruct *mat;
  CsrMatrix                     *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  /* Pick op(A) and the result dimensions (m x n) per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* Either let hipSPARSE apply the transpose, or use the explicitly stored transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = HIPSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
  if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the SpMM result into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
  hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }
    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }
    if (!mat->matDescr) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                            HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
    /* Grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallHIP(hipFree(mmdata->mmBuffer));
      PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do hipsparseSpMM, which supports transpose on B */
  PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = B * X (no transpose on B) */
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    /* C = B^T * X */
    PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  /* Undo the in-place host->device conversions done above, if any */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2344: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2345: {
2346: Mat_Product *product = C->product;
2347: Mat A, B;
2348: PetscInt m, n;
2349: PetscBool cisdense, flg;
2350: MatProductCtx_MatMatHipsparse *mmdata;
2351: Mat_SeqAIJHIPSPARSE *cusp;
2353: PetscFunctionBegin;
2354: MatCheckProduct(C, 1);
2355: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2356: A = product->A;
2357: B = product->B;
2358: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2359: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2360: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2361: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2362: switch (product->type) {
2363: case MATPRODUCT_AB:
2364: m = A->rmap->n;
2365: n = B->cmap->n;
2366: break;
2367: case MATPRODUCT_AtB:
2368: m = A->cmap->n;
2369: n = B->cmap->n;
2370: break;
2371: case MATPRODUCT_ABt:
2372: m = A->rmap->n;
2373: n = B->rmap->n;
2374: break;
2375: case MATPRODUCT_PtAP:
2376: m = B->cmap->n;
2377: n = B->cmap->n;
2378: break;
2379: case MATPRODUCT_RARt:
2380: m = B->rmap->n;
2381: n = B->rmap->n;
2382: break;
2383: default:
2384: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2385: }
2386: PetscCall(MatSetSizes(C, m, n, m, n));
2387: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2388: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2389: PetscCall(MatSetType(C, MATSEQDENSEHIP));
2391: /* product data */
2392: PetscCall(PetscNew(&mmdata));
2393: mmdata->cisdense = cisdense;
2394: /* for these products we need intermediate storage */
2395: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2396: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2397: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2398: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2399: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2400: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2401: }
2402: C->product->data = mmdata;
2403: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2404: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2405: PetscFunctionReturn(PETSC_SUCCESS);
2406: }
/* Numeric phase for C = A*B (also A^t*B and A*B^t via explicit transposes) with A, B, C
   all MATSEQAIJHIPSPARSE. The descriptors, SpGEMM workspace and flop count were prepared
   by MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE and live in C->product->data. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatProductCtx_MatMatHipsparse *mmdata;
  hipsparseSpMatDescr_t          BmatSpDescr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty product: skip the GPU work but still run the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

  /* mirror the symmetry-based product substitutions made by the symbolic phase,
     so that the same mult structs (plain vs transpose) are selected here */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* select operand mult structs; transposes were formed explicitly during the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* SpGEMMreuse path: the pattern/workspace in mmdata->spgemmDesc were built in the symbolic phase */
  PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#else
  /* older SpGEMM API: recompute with the buffers sized during the symbolic phase, then copy into C */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#endif
#else
  /* legacy csrgemm interface (pre-HIP 5.0) */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2523: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2524: {
2525: Mat_Product *product = C->product;
2526: Mat A, B;
2527: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2528: Mat_SeqAIJ *a, *b, *c;
2529: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2530: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2531: PetscInt i, j, m, n, k;
2532: PetscBool flg;
2533: MatProductType ptype;
2534: MatProductCtx_MatMatHipsparse *mmdata;
2535: PetscLogDouble flops;
2536: PetscBool biscompressed, ciscompressed;
2537: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2538: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2539: hipsparseSpMatDescr_t BmatSpDescr;
2540: #else
2541: int cnz;
2542: #endif
2543: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2545: PetscFunctionBegin;
2546: MatCheckProduct(C, 1);
2547: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2548: A = product->A;
2549: B = product->B;
2550: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2551: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2552: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2553: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2554: a = (Mat_SeqAIJ *)A->data;
2555: b = (Mat_SeqAIJ *)B->data;
2556: /* product data */
2557: PetscCall(PetscNew(&mmdata));
2558: C->product->data = mmdata;
2559: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2561: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2562: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2563: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2564: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2565: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2566: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2568: ptype = product->type;
2569: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2570: ptype = MATPRODUCT_AB;
2571: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2572: }
2573: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2574: ptype = MATPRODUCT_AB;
2575: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2576: }
2577: biscompressed = PETSC_FALSE;
2578: ciscompressed = PETSC_FALSE;
2579: switch (ptype) {
2580: case MATPRODUCT_AB:
2581: m = A->rmap->n;
2582: n = B->cmap->n;
2583: k = A->cmap->n;
2584: Amat = Acusp->mat;
2585: Bmat = Bcusp->mat;
2586: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2587: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2588: break;
2589: case MATPRODUCT_AtB:
2590: m = A->cmap->n;
2591: n = B->cmap->n;
2592: k = A->rmap->n;
2593: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2594: Amat = Acusp->matTranspose;
2595: Bmat = Bcusp->mat;
2596: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2597: break;
2598: case MATPRODUCT_ABt:
2599: m = A->rmap->n;
2600: n = B->rmap->n;
2601: k = A->cmap->n;
2602: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2603: Amat = Acusp->mat;
2604: Bmat = Bcusp->matTranspose;
2605: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2606: break;
2607: default:
2608: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2609: }
2611: /* create hipsparse matrix */
2612: PetscCall(MatSetSizes(C, m, n, m, n));
2613: PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2614: c = (Mat_SeqAIJ *)C->data;
2615: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2616: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2617: Ccsr = new CsrMatrix;
2619: c->compressedrow.use = ciscompressed;
2620: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2621: c->compressedrow.nrows = a->compressedrow.nrows;
2622: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2623: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2624: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2625: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2626: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2627: } else {
2628: c->compressedrow.nrows = 0;
2629: c->compressedrow.i = NULL;
2630: c->compressedrow.rindex = NULL;
2631: Ccusp->workVector = NULL;
2632: Cmat->cprowIndices = NULL;
2633: }
2634: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2635: Ccusp->mat = Cmat;
2636: Ccusp->mat->mat = Ccsr;
2637: Ccsr->num_rows = Ccusp->nrows;
2638: Ccsr->num_cols = n;
2639: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2640: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2641: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2642: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2643: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2644: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2645: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2646: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2647: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2648: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2649: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2650: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2651: c->nz = 0;
2652: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2653: Ccsr->values = new THRUSTARRAY(c->nz);
2654: goto finalizesym;
2655: }
2657: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2658: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2659: Acsr = (CsrMatrix *)Amat->mat;
2660: if (!biscompressed) {
2661: Bcsr = (CsrMatrix *)Bmat->mat;
2662: BmatSpDescr = Bmat->matDescr;
2663: } else { /* we need to use row offsets for the full matrix */
2664: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2665: Bcsr = new CsrMatrix;
2666: Bcsr->num_rows = B->rmap->n;
2667: Bcsr->num_cols = cBcsr->num_cols;
2668: Bcsr->num_entries = cBcsr->num_entries;
2669: Bcsr->column_indices = cBcsr->column_indices;
2670: Bcsr->values = cBcsr->values;
2671: if (!Bcusp->rowoffsets_gpu) {
2672: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2673: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2674: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2675: }
2676: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2677: mmdata->Bcsr = Bcsr;
2678: if (Bcsr->num_rows && Bcsr->num_cols) {
2679: PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2680: }
2681: BmatSpDescr = mmdata->matSpBDescr;
2682: }
2683: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2684: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2685: /* precompute flops count */
2686: if (ptype == MATPRODUCT_AB) {
2687: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2688: const PetscInt st = a->i[i];
2689: const PetscInt en = a->i[i + 1];
2690: for (j = st; j < en; j++) {
2691: const PetscInt brow = a->j[j];
2692: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2693: }
2694: }
2695: } else if (ptype == MATPRODUCT_AtB) {
2696: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2697: const PetscInt anzi = a->i[i + 1] - a->i[i];
2698: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2699: flops += (2. * anzi) * bnzi;
2700: }
2701: } else flops = 0.; /* TODO */
2703: mmdata->flops = flops;
2704: PetscCall(PetscLogGpuTimeBegin());
2705: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2706: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2707: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2708: PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2709: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2710: {
2711: /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2712: We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2713: */
2714: void *dBuffer1 = NULL;
2715: void *dBuffer2 = NULL;
2716: void *dBuffer3 = NULL;
2717: /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2718: size_t bufferSize1 = 0;
2719: size_t bufferSize2 = 0;
2720: size_t bufferSize3 = 0;
2721: size_t bufferSize4 = 0;
2722: size_t bufferSize5 = 0;
2724: /* ask bufferSize1 bytes for external memory */
2725: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2726: PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2727: /* inspect the matrices A and B to understand the memory requirement for the next step */
2728: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2730: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2731: PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2732: PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2733: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2734: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2735: PetscCallHIP(hipFree(dBuffer1));
2736: PetscCallHIP(hipFree(dBuffer2));
2738: /* get matrix C non-zero entries C_nnz1 */
2739: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2740: c->nz = (PetscInt)C_nnz1;
2741: /* allocate matrix C */
2742: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2743: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2744: Ccsr->values = new THRUSTARRAY(c->nz);
2745: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2746: /* update matC with the new pointers */
2747: if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2748: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2750: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2751: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2752: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2753: PetscCallHIP(hipFree(dBuffer3));
2754: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2755: }
2756: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2757: }
2758: #else
2759: size_t bufSize2;
2760: /* ask bufferSize bytes for external memory */
2761: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2762: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2763: /* inspect the matrices A and B to understand the memory requirement for the next step */
2764: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2765: /* ask bufferSize again bytes for external memory */
2766: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2767: /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2768: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2769: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2770: is stored in the descriptor! What a messy API... */
2771: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2772: /* compute the intermediate product of A * B */
2773: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2774: /* get matrix C non-zero entries C_nnz1 */
2775: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2776: c->nz = (PetscInt)C_nnz1;
2777: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2778: mmdata->mmBufferSize / 1024));
2779: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2780: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2781: Ccsr->values = new THRUSTARRAY(c->nz);
2782: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2783: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2784: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2785: #endif
2786: #else
2787: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2788: PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2789: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2790: c->nz = cnz;
2791: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2792: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2793: Ccsr->values = new THRUSTARRAY(c->nz);
2794: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2796: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2797: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2798: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2799: D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2800: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2801: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2802: Ccsr->column_indices->data().get()));
2803: #endif
2804: PetscCall(PetscLogGpuFlops(mmdata->flops));
2805: PetscCall(PetscLogGpuTimeEnd());
2806: finalizesym:
2807: c->free_a = PETSC_TRUE;
2808: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2809: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2810: c->free_ij = PETSC_TRUE;
2811: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2812: PetscInt *d_i = c->i;
2813: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2814: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2815: ii = *Ccsr->row_offsets;
2816: jj = *Ccsr->column_indices;
2817: if (ciscompressed) d_i = c->compressedrow.i;
2818: PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2819: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2820: } else {
2821: PetscInt *d_i = c->i;
2822: if (ciscompressed) d_i = c->compressedrow.i;
2823: PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2824: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2825: }
2826: if (ciscompressed) { /* need to expand host row offsets */
2827: PetscInt r = 0;
2828: c->i[0] = 0;
2829: for (k = 0; k < c->compressedrow.nrows; k++) {
2830: const PetscInt next = c->compressedrow.rindex[k];
2831: const PetscInt old = c->compressedrow.i[k];
2832: for (; r < next; r++) c->i[r + 1] = old;
2833: }
2834: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2835: }
2836: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2837: PetscCall(PetscMalloc1(m, &c->ilen));
2838: PetscCall(PetscMalloc1(m, &c->imax));
2839: c->maxnz = c->nz;
2840: c->nonzerorowcnt = 0;
2841: c->rmax = 0;
2842: for (k = 0; k < m; k++) {
2843: const PetscInt nn = c->i[k + 1] - c->i[k];
2844: c->ilen[k] = c->imax[k] = nn;
2845: c->nonzerorowcnt += (PetscInt)!!nn;
2846: c->rmax = PetscMax(c->rmax, nn);
2847: }
2848: PetscCall(PetscMalloc1(c->nz, &c->a));
2849: Ccsr->num_entries = c->nz;
2851: C->nonzerostate++;
2852: PetscCall(PetscLayoutSetUp(C->rmap));
2853: PetscCall(PetscLayoutSetUp(C->cmap));
2854: Ccusp->nonzerostate = C->nonzerostate;
2855: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2856: C->preallocated = PETSC_TRUE;
2857: C->assembled = PETSC_FALSE;
2858: C->was_assembled = PETSC_FALSE;
2859: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2860: mmdata->reusesym = PETSC_TRUE;
2861: C->offloadmask = PETSC_OFFLOAD_GPU;
2862: }
2863: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2864: PetscFunctionReturn(PETSC_SUCCESS);
2865: }
/* handles sparse or dense B */
/* Select the productsymbolic implementation for mat = op(A)*op(B)[*op(C)]:
   - dense B: use the AIJHIPSPARSE x DENSEHIP kernels (unless A is bound to the CPU);
   - B (and C for ABC) of type MATSEQAIJHIPSPARSE with nothing bound to the CPU: GPU sparse kernels;
   - otherwise: fall back to the plain SeqAIJ (CPU) implementation.
   Runtime options such as -matmatmult_backend_cpu let the user force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* a matrix bound to the CPU disqualifies the GPU sparse-sparse backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user called the classic API (MatMatMult etc.)
       or the MatProduct API; both map to the same usecpu flag */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no dedicated GPU kernel for triple products: compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A*xx (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A^H*xx (trans = herm = PETSC_TRUE selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A^T*xx (transpose without conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Used to scatter-add a compressed work vector x back into the full-length vector y
   at the row positions listed in idx. One thread per entry; callers launch at least
   n threads (see the (n + 255)/256 x 256 launch below), so the bounds check suffices. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind MatMult/MatMultAdd/MatMultTranspose/... for this type.
   Handles the "compressed row" storage case (zero rows dropped on the GPU) by routing the
   short product through hipsparsestruct->workVector and scattering/gathering as needed. */
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                    *a               = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE           *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
  PetscScalar                   *xarray, *zarray, *dptr, *beta, *xptr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                      compressed;
  PetscInt                       nx, ny;

  PetscFunctionBegin;
  /* herm without trans (i.e. plain conjugation) is not a supported op() */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: no hipSPARSE call needed, z = y (or 0) */
    if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
    else PetscCall(VecSeq_HIP::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) { /* let hipSPARSE apply the (conjugate) transpose on the fly */
      opA       = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
    } else { /* use an explicitly stored transpose, building it on first use */
      if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
  try {
    PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecHIPGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::hip::par.on(PetscDefaultHipStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
      }
      if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
    }
    /* csr_spmv does y = alpha op(A) x + beta y */
    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !(PETSC_PKG_HIP_VERSION_GT(6, 4, 3) && PETSC_PKG_HIP_VERSION_LE(7, 2, 0))
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
        PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
                                                    &matstruct->hipSpMV[opA].spmvBufferSize));
        PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
        matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
        PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
      }
      PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
                                       matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      nx             = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
      ny             = mat->num_cols;
      PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (hipsparsestruct->nrows) { /* HYB (or ELL) storage path */
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
        PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_HIP::Copy(yy, zz));        /* zz = yy */
        } else if (zz != yy) {                        /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy));   /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_HIP::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
                        thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                        thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                        VecHIPPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
    else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
  }
  /* 2 flops (multiply + add) per stored nonzero; without the add of y, the first entry of each nonzero row needs no add */
  if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Assembly is done entirely by the host SeqAIJ code; the GPU copy is refreshed lazily
   (e.g. in MatSeqAIJHIPSPARSECopyToGPU()) the next time a device operation needs it */
static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3190: /*@
3191: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
  This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3194: Collective
3196: Input Parameters:
3197: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3198: . m - number of rows
3199: . n - number of columns
3200: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3201: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3203: Output Parameter:
3204: . A - the matrix
3206: Level: intermediate
3208: Notes:
3209: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3211: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3213: The AIJ format (compressed row storage), is fully compatible with standard Fortran
3214: storage. That is, the stored row and column indices can begin at
3215: either one (as in Fortran) or zero.
3217: Specify the preallocated storage with either `nz` or `nnz` (not both).
3218: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3219: allocation.
3221: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3222: @*/
PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
  /* the cast drops const only to match the preallocation routine's signature; nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Release the GPU-side data (or the triangular-factor data for factored matrices),
   un-compose every method this type registered on the object, then destroy the host AIJ part */
static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
  /* composing NULL removes the previously composed functions */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Duplicate via the host AIJ path, then convert the copy in place back to AIJHIPSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Y = Y + a*X, keeping the data on the GPU when possible.
   Fast paths: a single hipBLAS axpy when X and Y share the same nonzero pattern,
   hipSPARSE csrgeam when X's pattern is a subset of Y's; otherwise fall back to the CPU kernel. */
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ          *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJHIPSPARSE *cy;
  Mat_SeqAIJHIPSPARSE *cx;
  PetscScalar         *ay;
  const PetscScalar   *ax;
  CsrMatrix           *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mismatched implementations (e.g. one matrix bound to the CPU): use the CPU kernel */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a hipblas axpy: equal nz counts and identical row offsets
     and column indices mean the two matrices share the same nonzero pattern */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    /* alpha/beta (&a, &b) are passed as host pointers below, so switch the pointer mode */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                       csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallHIP(hipMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallHIP(hipFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                            csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the device pointer mode expected elsewhere in this file */
    PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up entry-for-entry, so ay += a*ax */
    hipblasHandle_t hipblasv2handle;
    PetscBLASInt    one = 1, bnz = 1;

    PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
    PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
  } else {
    /* different patterns: no GPU shortcut, use the CPU kernel */
    PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3343: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3344: {
3345: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3346: PetscScalar *ay;
3347: hipblasHandle_t hipblasv2handle;
3348: PetscBLASInt one = 1, bnz = 1;
3350: PetscFunctionBegin;
3351: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3352: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3353: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3354: PetscCall(PetscLogGpuTimeBegin());
3355: PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3356: PetscCall(PetscLogGpuFlops(bnz));
3357: PetscCall(PetscLogGpuTimeEnd());
3358: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3359: PetscFunctionReturn(PETSC_SUCCESS);
3360: }
3362: struct DiagonalScaleLeft {
3363: const PetscScalar *lv_ptr;
3364: PetscScalar *val_ptr;
3365: const int *row_ptr;
3366: const PetscInt *cprow_ptr;
3367: __host__ __device__ void operator()(int i) const
3368: {
3369: const int row = cprow_ptr ? (int)cprow_ptr[i] : i;
3370: const PetscScalar s = lv_ptr[row];
3371: for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) val_ptr[j] *= s;
3372: }
3373: };
/* A = diag(l) * A * diag(r): scale rows by l and columns by r, entirely on the GPU.
   Either vector may be NULL to skip that side. */
static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat A, Vec l, Vec r)
{
  Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)A->data;
  CsrMatrix         *csr;
  const PetscScalar *v;
  PetscScalar       *av;
  PetscInt           m, n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &av));
  csr = (CsrMatrix *)((Mat_SeqAIJHIPSPARSE *)A->spptr)->mat->mat;
  if (l) {
    /* with compressed rows, the stored rows are a subset; cprow maps them back to full row indices */
    const PetscInt   *cprow = ((Mat_SeqAIJHIPSPARSE *)A->spptr)->mat->cprowIndices ? ((Mat_SeqAIJHIPSPARSE *)A->spptr)->mat->cprowIndices->data().get() : NULL;
    DiagonalScaleLeft functor;

    PetscCall(VecGetLocalSize(l, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling Vec of wrong length");
    PetscCall(VecHIPGetArrayRead(l, &v));
    functor = {v, av, csr->row_offsets->data().get(), cprow};
    PetscCallThrust(thrust::for_each(thrust::hip::par.on(PetscDefaultHipStream), thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(csr->num_rows), functor));
    PetscCall(VecHIPRestoreArrayRead(l, &v));
    PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
  }
  /* NOTE(review): the array is restored here, while the right-scaling below still writes
     csr->values directly — presumably safe because GetArray above already marked the values
     modified on the GPU; confirm against MatSeqAIJHIPSPARSERestoreArray's semantics */
  PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &av));
  if (r) {
    PetscCall(VecGetLocalSize(r, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling Vec of wrong length");
    PetscCall(VecHIPGetArrayRead(r, &v));
    /* values[k] *= r[column_indices[k]] for every stored entry */
    PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(v), csr->column_indices->begin()), csr->values->begin(), thrust::multiplies<PetscScalar>()));
    PetscCall(VecHIPRestoreArrayRead(r, &v));
    PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Zero all stored values, on both the device copies (matrix and its cached transpose,
   when present) and the host array, so the nonzero pattern is kept but every value is 0 */
static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the GPU copy was also zeroed */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) { /* keep the cached explicit transpose consistent */
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  //PetscCall(MatZeroEntries_SeqAIJ(A));
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* zero the host values directly */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* This matrix type always reports its data as residing in HIP device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_HIP;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Switch A's function tables between the CPU (SeqAIJ) and GPU (HIPSPARSE) implementations.
   flg == PETSC_TRUE binds the matrix to the CPU: data is copied back from the device and
   all ops (and composed functions) revert to the plain SeqAIJ versions; flg == PETSC_FALSE
   installs the HIPSPARSE ops and composed functions. */
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) { /* factored matrices only record the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make the host copy current before handing control to the CPU kernels */
    PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->diagonalscale             = MatDiagonalScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype         = NULL;
    /* zeroing a->ops falls back to the default SeqAIJ array accessors */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJHIPSPARSE;
    A->ops->diagonalscale             = MatDiagonalScale_SeqAIJHIPSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJHIPSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJHIPSPARSE;
    A->ops->mult                      = MatMult_SeqAIJHIPSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJHIPSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJHIPSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJHIPSPARSE;
    A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJHIPSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the CPU kernels */
  if (flg && a->inode.size_csr) a->inode.use = PETSC_TRUE;
  else a->inode.use = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Convert a MATSEQAIJ matrix to MATSEQAIJHIPSPARSE: copy/duplicate the host data
   as requested by reuse, attach the hipSPARSE context to B->spptr, switch the
   default vector type to VECHIP, and install the GPU method table (the bulk of
   which is filled in by MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE) below). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX neither branch runs; *newmat is presumably A itself */

  /* vectors created from this matrix (MatCreateVecs()) should default to GPU vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: context with its own hipSPARSE handle
         bound to PETSc's default HIP stream */
      Mat_SeqAIJHIPSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      spptr->format = MAT_HIPSPARSE_CSR;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
      spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
#else
      spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

      B->spptr = spptr;
    } else {
      /* factored matrix: context holds the triangular factors instead */
      Mat_SeqAIJHIPSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no device copy of the values yet */
  }
  B->ops->assemblyend       = MatAssemblyEnd_SeqAIJHIPSPARSE;
  B->ops->destroy           = MatDestroy_SeqAIJHIPSPARSE;
  B->ops->setoption         = MatSetOption_SeqAIJHIPSPARSE;
  B->ops->setfromoptions    = MatSetFromOptions_SeqAIJHIPSPARSE;
  B->ops->bindtocpu         = MatBindToCPU_SeqAIJHIPSPARSE;
  B->ops->duplicate         = MatDuplicate_SeqAIJHIPSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;

  /* install the remaining GPU operations and composed functions */
  PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Constructor for MATSEQAIJHIPSPARSE: build a plain SeqAIJ matrix first, then
   convert it in place so the hipSPARSE context and method table are installed
   by the shared conversion routine above. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3576: /*MC
3577: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
  A matrix type whose data resides on AMD GPUs. These matrices can be stored in
  CSR, ELL, or Hybrid format.
3581: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
3583: Options Database Keys:
3584: + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3585: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3586: Other options include ell (ellpack) or hyb (hybrid).
3587: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3588: - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3590: Level: beginner
3592: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3593: M*/
/* Register the hipSPARSE solver package: the same factory routine provides
   LU, Cholesky, ILU, and ICC factorizations for MATSEQAIJHIPSPARSE matrices. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Free the GPU-side context (mat->spptr) of a MATSEQAIJHIPSPARSE matrix:
   the mat and matTranspose mult structures, the thrust scratch arrays, and
   the hipSPARSE handle. Safe to call when the context was never created. */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* thrust device arrays; deleting a NULL pointer is a no-op */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3623: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3624: {
3625: PetscFunctionBegin;
3626: if (*mat) {
3627: delete (*mat)->values;
3628: delete (*mat)->column_indices;
3629: delete (*mat)->row_offsets;
3630: delete *mat;
3631: *mat = 0;
3632: }
3633: PetscFunctionReturn(PETSC_SUCCESS);
3634: }
/* Free a single triangular-factor struct: hipSPARSE descriptors, its CSR
   storage, and the device / pinned-host scratch buffers. hipFree(NULL) is a
   no-op, so unallocated buffers are handled uniformly. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    PetscCallHIP(hipFree((*trifactor)->solveBuffer));
    PetscCallHIP(hipHostFree((*trifactor)->AA_h)); /* pinned host buffer */
    PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Free a mult structure. The storage format decides how the opaque mat
   pointer is interpreted: ELL/HYB use a hipsparseHybMat_t, everything else
   a CsrMatrix. Also releases the generic-API descriptors and the per-op
   SpMV buffers that were lazily created. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
        PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used by the SpMV/SpMM calls */
    PetscCallHIP(hipFree((*matstruct)->alpha_one));
    PetscCallHIP(hipFree((*matstruct)->beta_zero));
    PetscCallHIP(hipFree((*matstruct)->beta_one));

    Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup (buffer + dense-vector descriptors) per operation slot */
    for (int i = 0; i < 3; i++) {
      if (mdata->hipSpMV[i].initialized) {
        PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
      }
    }
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Release all factorization data held by a TriFactors container while keeping
   the container (and its hipSPARSE handle) alive, so it can be refilled by a
   new symbolic/numeric factorization. Safe to call on a NULL container. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* legacy (pre-generic-API) factor structures */
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->workVector    = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* generic-API (SpSV) factor data; hipFree(NULL) is a no-op */
    PetscCallHIP(hipFree(fs->csrRowPtr));
    PetscCallHIP(hipFree(fs->csrColIdx));
    PetscCallHIP(hipFree(fs->csrVal));
    PetscCallHIP(hipFree(fs->X));
    PetscCallHIP(hipFree(fs->Y));
    // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallHIP(hipFree(fs->spsvBuffer_L));
    PetscCallHIP(hipFree(fs->spsvBuffer_U));
    PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
    PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
    PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

    /* transpose-solve descriptors/analysis must be rebuilt after a reset */
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3734: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3735: {
3736: hipsparseHandle_t handle;
3738: PetscFunctionBegin;
3739: if (*trifactors) {
3740: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3741: if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3742: PetscCall(PetscFree(*trifactors));
3743: }
3744: PetscFunctionReturn(PETSC_SUCCESS);
3745: }
3747: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3748: {
3749: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3751: PetscFunctionBegin;
3752: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3753: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3754: if (destroy) {
3755: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3756: delete cusp->csr2csc_i;
3757: cusp->csr2csc_i = NULL;
3758: }
3759: A->transupdated = PETSC_FALSE;
3760: PetscFunctionReturn(PETSC_SUCCESS);
3761: }
3763: static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(PetscCtxRt data)
3764: {
3765: MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)data;
3767: PetscFunctionBegin;
3768: PetscCallHIP(hipFree(coo->perm));
3769: PetscCallHIP(hipFree(coo->jmap));
3770: PetscCall(PetscFree(coo));
3771: PetscFunctionReturn(PETSC_SUCCESS);
3772: }
/* COO preallocation for MATSEQAIJHIPSPARSE. The CSR pattern and the COO
   assembly maps (jmap, perm) are built on the host by the SeqAIJ routine,
   then mirrored into a device-side MatCOOStruct_SeqAIJ that the
   MatSetValuesCOO kernel consumes. Accepts coo_i/coo_j in host or device
   memory. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  /* the host preallocation routine needs i/j on the host; copy them down
     if the caller supplied device pointers */
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* pattern was just (re)built on the host */
  // Create the GPU memory
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  /* jmap has nz+1 offsets; perm has Atot entries (one per input COO entry) */
  PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Device kernel: accumulate user-provided COO values kv[] into the CSR value
   array a[]. For CSR entry i, jmap[i]..jmap[i+1] delimits the permuted COO
   entries mapping to it; their sum replaces a[i] for INSERT_VALUES or is
   added to it otherwise. Grid-stride loop so any launch geometry covers all
   nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
3825: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3826: {
3827: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
3828: Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3829: PetscCount Annz = seq->nz;
3830: PetscMemType memtype;
3831: const PetscScalar *v1 = v;
3832: PetscScalar *Aa;
3833: PetscContainer container;
3834: MatCOOStruct_SeqAIJ *coo;
3836: PetscFunctionBegin;
3837: if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3839: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
3840: PetscCall(PetscContainerGetPointer(container, &coo));
3842: PetscCall(PetscGetMemType(v, &memtype));
3843: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
3844: PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
3845: PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
3846: }
3848: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
3849: else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));
3851: PetscCall(PetscLogGpuTimeBegin());
3852: if (Annz) {
3853: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
3854: PetscCallHIP(hipPeekAtLastError());
3855: }
3856: PetscCall(PetscLogGpuTimeEnd());
3858: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
3859: else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
3861: if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
3862: PetscFunctionReturn(PETSC_SUCCESS);
3863: }
3865: /*@C
3866: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3868: Not Collective
3870: Input Parameters:
3871: + A - the matrix
3872: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3874: Output Parameters:
3875: + i - the CSR row pointers
3876: - j - the CSR column indices
3878: Level: developer
3880: Note:
3881: When compressed is true, the CSR structure does not contain empty rows
3883: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3884: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ          *a    = (Mat_SeqAIJ *)A->data;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  /* nothing to do unless the caller asked for both arrays */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is stored in compressed-row form; lazily build and
         cache the full (n+1)-entry row-offset array from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3913: /*@C
3914: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3916: Not Collective
3918: Input Parameters:
3919: + A - the matrix
3920: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3921: . i - the CSR row pointers
3922: - j - the CSR column indices
3924: Level: developer
3926: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3927: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* the pointers are borrowed from the matrix's GPU storage; NULL them so
     the caller cannot keep using them after the restore */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3938: /*@C
3939: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3941: Not Collective
3943: Input Parameter:
3944: . A - a `MATSEQAIJHIPSPARSE` matrix
3946: Output Parameter:
3947: . a - pointer to the device data
3949: Level: developer
3951: Note:
3952: May trigger host-device copies if the up-to-date matrix data is on host
3954: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3955: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device copy is current before handing out a pointer */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  /* read-only access: offload mask and cached transpose stay untouched */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3974: /*@C
3975: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3977: Not Collective
3979: Input Parameters:
3980: + A - a `MATSEQAIJHIPSPARSE` matrix
3981: - a - pointer to the device data
3983: Level: developer
3985: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3986: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* read-only access: no state bump needed, just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3997: /*@C
3998: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4000: Not Collective
4002: Input Parameter:
4003: . A - a `MATSEQAIJHIPSPARSE` matrix
4005: Output Parameter:
4006: . a - pointer to the device data
4008: Level: developer
4010: Note:
4011: May trigger host-device copies if up-to-date matrix data is on host
4013: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
4014: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access: bring the device copy up to date first */
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* the caller may modify the values: the GPU copy becomes authoritative and
     the cached transpose can no longer be trusted */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4034: /*@C
4035: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
4037: Not Collective
4039: Input Parameters:
4040: + A - a `MATSEQAIJHIPSPARSE` matrix
4041: - a - pointer to the device data
4043: Level: developer
4045: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4046: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* values may have changed: bump the object state so dependent caches are refreshed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4058: /*@C
4059: MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4061: Not Collective
4063: Input Parameter:
4064: . A - a `MATSEQAIJHIPSPARSE` matrix
4066: Output Parameter:
4067: . a - pointer to the device data
4069: Level: developer
4071: Note:
4072: Does not trigger host-device copies and flags data validity on the GPU
4074: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4075: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix           *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJHIPSPARSECopyToGPU() here,
     since the caller is expected to overwrite all values */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* the GPU copy becomes authoritative; cached transpose is now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4095: /*@C
4096: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4098: Not Collective
4100: Input Parameters:
4101: + A - a `MATSEQAIJHIPSPARSE` matrix
4102: - a - pointer to the device data
4104: Level: developer
4106: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4107: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* values were (re)written: bump the object state so dependent caches are refreshed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4119: struct IJCompare4 {
4120: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4121: {
4122: if (t1.get<0>() < t2.get<0>()) return true;
4123: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4124: return false;
4125: }
4126: };
/* Unary functor adding a fixed offset to an int; used to shift B's column
   indices by A's column count when concatenating [A B]. */
struct Shift {
  int _shift;

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
4135: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4136: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4137: {
4138: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4139: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4140: Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4141: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4142: PetscInt Annz, Bnnz;
4143: PetscInt i, m, n, zero = 0;
4145: PetscFunctionBegin;
4148: PetscAssertPointer(C, 4);
4149: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4150: PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4151: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4152: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4153: PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4154: PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4155: if (reuse == MAT_INITIAL_MATRIX) {
4156: m = A->rmap->n;
4157: n = A->cmap->n + B->cmap->n;
4158: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4159: PetscCall(MatSetSizes(*C, m, n, m, n));
4160: PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4161: c = (Mat_SeqAIJ *)(*C)->data;
4162: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4163: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
4164: Ccsr = new CsrMatrix;
4165: Cmat->cprowIndices = NULL;
4166: c->compressedrow.use = PETSC_FALSE;
4167: c->compressedrow.nrows = 0;
4168: c->compressedrow.i = NULL;
4169: c->compressedrow.rindex = NULL;
4170: Ccusp->workVector = NULL;
4171: Ccusp->nrows = m;
4172: Ccusp->mat = Cmat;
4173: Ccusp->mat->mat = Ccsr;
4174: Ccsr->num_rows = m;
4175: Ccsr->num_cols = n;
4176: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4177: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4178: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4179: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4180: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4181: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4182: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4183: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4184: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4185: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4186: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4187: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4188: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4190: Acsr = (CsrMatrix *)Acusp->mat->mat;
4191: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4192: Annz = (PetscInt)Acsr->column_indices->size();
4193: Bnnz = (PetscInt)Bcsr->column_indices->size();
4194: c->nz = Annz + Bnnz;
4195: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4196: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4197: Ccsr->values = new THRUSTARRAY(c->nz);
4198: Ccsr->num_entries = c->nz;
4199: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4200: if (c->nz) {
4201: auto Acoo = new THRUSTINTARRAY32(Annz);
4202: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4203: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4204: THRUSTINTARRAY32 *Aroff, *Broff;
4206: if (a->compressedrow.use) { /* need full row offset */
4207: if (!Acusp->rowoffsets_gpu) {
4208: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4209: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4210: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4211: }
4212: Aroff = Acusp->rowoffsets_gpu;
4213: } else Aroff = Acsr->row_offsets;
4214: if (b->compressedrow.use) { /* need full row offset */
4215: if (!Bcusp->rowoffsets_gpu) {
4216: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4217: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4218: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4219: }
4220: Broff = Bcusp->rowoffsets_gpu;
4221: } else Broff = Bcsr->row_offsets;
4222: PetscCall(PetscLogGpuTimeBegin());
4223: PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4224: PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4225: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4226: auto Aperm = thrust::make_constant_iterator(1);
4227: auto Bperm = thrust::make_constant_iterator(0);
4228: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4229: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4230: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4231: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4232: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4233: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4234: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4235: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4236: auto p1 = Ccusp->coords->begin();
4237: auto p2 = Ccusp->coords->begin();
4238: thrust::advance(p2, Annz);
4239: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4240: auto cci = thrust::make_counting_iterator(zero);
4241: auto cce = thrust::make_counting_iterator(c->nz);
4242: #if 0 //Errors on SUMMIT cuda 11.1.0
4243: PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4244: #else
4245: auto pred = [](const int &x) { return x; };
4246: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4247: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4248: #endif
4249: PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4250: PetscCall(PetscLogGpuTimeEnd());
4251: delete wPerm;
4252: delete Acoo;
4253: delete Bcoo;
4254: delete Ccoo;
4255: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4257: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4258: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4259: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4260: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4261: Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4262: CsrMatrix *CcsrT = new CsrMatrix;
4263: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4264: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4266: (*C)->form_explicit_transpose = PETSC_TRUE;
4267: (*C)->transupdated = PETSC_TRUE;
4268: Ccusp->rowoffsets_gpu = NULL;
4269: CmatT->cprowIndices = NULL;
4270: CmatT->mat = CcsrT;
4271: CcsrT->num_rows = n;
4272: CcsrT->num_cols = m;
4273: CcsrT->num_entries = c->nz;
4274: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4275: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4276: CcsrT->values = new THRUSTARRAY(c->nz);
4278: PetscCall(PetscLogGpuTimeBegin());
4279: auto rT = CcsrT->row_offsets->begin();
4280: if (AT) {
4281: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4282: thrust::advance(rT, -1);
4283: }
4284: if (BT) {
4285: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4286: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4287: thrust::copy(titb, tite, rT);
4288: }
4289: auto cT = CcsrT->column_indices->begin();
4290: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4291: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4292: auto vT = CcsrT->values->begin();
4293: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4294: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4295: PetscCall(PetscLogGpuTimeEnd());
4297: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4298: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4299: PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4300: PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4301: PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4302: PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4303: PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4304: PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4305: PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4307: PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4308: Ccusp->matTranspose = CmatT;
4309: }
4310: }
4312: c->free_a = PETSC_TRUE;
4313: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4314: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4315: c->free_ij = PETSC_TRUE;
4316: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4317: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4318: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4319: ii = *Ccsr->row_offsets;
4320: jj = *Ccsr->column_indices;
4321: PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4322: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4323: } else {
4324: PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4325: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4326: }
4327: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4328: PetscCall(PetscMalloc1(m, &c->ilen));
4329: PetscCall(PetscMalloc1(m, &c->imax));
4330: c->maxnz = c->nz;
4331: c->nonzerorowcnt = 0;
4332: c->rmax = 0;
4333: for (i = 0; i < m; i++) {
4334: const PetscInt nn = c->i[i + 1] - c->i[i];
4335: c->ilen[i] = c->imax[i] = nn;
4336: c->nonzerorowcnt += (PetscInt)!!nn;
4337: c->rmax = PetscMax(c->rmax, nn);
4338: }
4339: PetscCall(PetscMalloc1(c->nz, &c->a));
4340: (*C)->nonzerostate++;
4341: PetscCall(PetscLayoutSetUp((*C)->rmap));
4342: PetscCall(PetscLayoutSetUp((*C)->cmap));
4343: Ccusp->nonzerostate = (*C)->nonzerostate;
4344: (*C)->preallocated = PETSC_TRUE;
4345: } else {
4346: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4347: c = (Mat_SeqAIJ *)(*C)->data;
4348: if (c->nz) {
4349: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4350: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4351: PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4352: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4353: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4354: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4355: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4356: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4357: Acsr = (CsrMatrix *)Acusp->mat->mat;
4358: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4359: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4360: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4361: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4362: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4363: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4364: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4365: auto pmid = Ccusp->coords->begin();
4366: thrust::advance(pmid, Acsr->num_entries);
4367: PetscCall(PetscLogGpuTimeBegin());
4368: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4369: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4370: thrust::for_each(zibait, zieait, VecHIPEquals());
4371: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4372: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4373: thrust::for_each(zibbit, ziebit, VecHIPEquals());
4374: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4375: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4376: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4377: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4378: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4379: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4380: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4381: auto vT = CcsrT->values->begin();
4382: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4383: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4384: (*C)->transupdated = PETSC_TRUE;
4385: }
4386: PetscCall(PetscLogGpuTimeEnd());
4387: }
4388: }
4389: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4390: (*C)->assembled = PETSC_TRUE;
4391: (*C)->was_assembled = PETSC_FALSE;
4392: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4393: PetscFunctionReturn(PETSC_SUCCESS);
4394: }
4396: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4397: {
4398: bool dmem;
4399: const PetscScalar *av;
4401: PetscFunctionBegin;
4402: dmem = isHipMem(v);
4403: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4404: if (n && idx) {
4405: THRUSTINTARRAY widx(n);
4406: widx.assign(idx, idx + n);
4407: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4409: THRUSTARRAY *w = NULL;
4410: thrust::device_ptr<PetscScalar> dv;
4411: if (dmem) dv = thrust::device_pointer_cast(v);
4412: else {
4413: w = new THRUSTARRAY(n);
4414: dv = w->data();
4415: }
4416: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4418: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4419: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4420: thrust::for_each(zibit, zieit, VecHIPEquals());
4421: if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4422: delete w;
4423: } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4425: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4426: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4427: PetscFunctionReturn(PETSC_SUCCESS);
4428: }