// Actual source code: aijhipsparse.hip.cpp
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/iterator/transform_iterator.h>
17: #if PETSC_CPP_VERSION >= 14
18: #define PETSC_HAVE_THRUST_ASYNC 1
19: #include <thrust/async/for_each.h>
20: #endif
21: #include <thrust/iterator/constant_iterator.h>
22: #include <thrust/iterator/discard_iterator.h>
23: #include <thrust/binary_search.h>
24: #include <thrust/remove.h>
25: #include <thrust/sort.h>
26: #include <thrust/unique.h>
/* Human-readable names consumed by PetscOptionsEnum(); entry order must match the
   corresponding hipSPARSE enum values (checked in MatSetFromOptions_SeqAIJHIPSPARSE) */
const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
//const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};

/* Forward declarations: factorization symbolic/numeric phases */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
/* Triangular solves (permuted and natural-ordering variants) */
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
/* Matrix-vector products (plain, transpose, Hermitian, and fused-add forms) */
static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
/* Lifetime management of the GPU-side mirrors and factor structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
69: /*
70: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71: {
72: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
74: PetscFunctionBegin;
75: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76: hipsparsestruct->stream = stream;
77: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78: PetscFunctionReturn(PETSC_SUCCESS);
79: }
81: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82: {
83: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
85: PetscFunctionBegin;
86: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87: if (hipsparsestruct->handle != handle) {
88: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89: hipsparsestruct->handle = handle;
90: }
91: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
95: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96: {
97: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98: PetscBool flg;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
106: */
108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
112: PetscFunctionBegin;
113: switch (op) {
114: case MAT_HIPSPARSE_MULT:
115: hipsparsestruct->format = format;
116: break;
117: case MAT_HIPSPARSE_ALL:
118: hipsparsestruct->format = format;
119: break;
120: default:
121: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122: }
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: /*@
127: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
128: operation. Only the `MatMult()` operation can use different GPU storage formats
130: Not Collective
132: Input Parameters:
133: + A - Matrix of type `MATSEQAIJHIPSPARSE`
134: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
138: Level: intermediate
140: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on the matrix object; silently a
     no-op for matrix types that did not register "MatHIPSPARSESetFormat_C" */
  PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
150: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151: {
152: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
154: PetscFunctionBegin;
155: hipsparsestruct->use_cpu_solve = use_cpu;
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: /*@
160: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
162: Input Parameters:
163: + A - Matrix of type `MATSEQAIJHIPSPARSE`
164: - use_cpu - set flag for using the built-in CPU `MatSolve()`
166: Level: intermediate
168: Notes:
169: The hipSparse LU solver currently computes the factors with the built-in CPU method
170: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
173: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on the matrix object; silently a
     no-op for matrix types that did not register "MatHIPSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
183: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184: {
185: PetscFunctionBegin;
186: switch (op) {
187: case MAT_FORM_EXPLICIT_TRANSPOSE:
188: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190: A->form_explicit_transpose = flg;
191: break;
192: default:
193: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194: break;
195: }
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
/* Numeric LU factorization for MATSEQAIJHIPSPARSE.

   The factorization itself runs on the CPU via the SeqAIJ kernel; this wrapper
   then installs solve callbacks (natural-ordering fast path when both
   permutations are identity) and, unless the user requested CPU solves, copies
   the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscBool row_identity, col_identity;
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS isrow = b->row, iscol = b->col;
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* ensure the host copy of A is current before the CPU factorization reads it */
  PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  /* the freshly computed factors live only on the host at this point */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!hipsparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
    }
  }
  /* no MatMatSolve support in this implementation */
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!hipsparsestruct->use_cpu_solve) { PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Process the -mat_hipsparse_* runtime options: storage format for SpMV and/or
   all operations, CPU-vs-GPU triangular solve, and the hipSPARSE SpMV/SpMM
   algorithm selections. Only applies to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatHIPSPARSEStorageFormat format;
  PetscBool flg;
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
    /* NOTE: this option is processed second, so it overrides -mat_hipsparse_mult_storage_format when both are given */
    PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
    PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
    PetscCall(
      PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
    /*
     PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
     PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
    */
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Build (or refresh) the GPU copy of the unit-diagonal lower triangular factor L
   from the host ILU factorization stored in the Mat_SeqAIJ of the factor matrix.

   First call: assembles a CSR copy of L in pinned host memory (explicitly adding
   the implicit unit diagonal), uploads it into a new CsrMatrix, and runs the
   hipSPARSE csrsv analysis. Subsequent calls (factor values changed, sparsity
   unchanged): only the values array is rebuilt and re-uploaded; the pinned host
   buffer AA_h is kept alive on the factor struct for that purpose. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
  const PetscInt *ai = a->i, *aj = a->j, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt *AiLo, *AjLo;
  PetscInt i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the factor data currently lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): assumes the factored SeqAIJ layout where ai[1..n] delimits the strictly-lower part — confirm against MatLUFactorNumeric_SeqAIJ */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;
        /* pinned host staging buffers for fast host->device transfer */
        PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v = aa;
        vi = aj;
        offset = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;
          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);

        /* upload the assembled CSR arrays to the device */
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                    loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep AALo alive (as AA_h) for cheap value-only updates; the index staging buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallHIP(hipHostFree(AiLo));
        PetscCallHIP(hipHostFree(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v = aa;
        vi = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Build (or refresh) the GPU copy of the upper triangular factor U from the
   host ILU factorization. Mirrors MatSeqAIJHIPSPARSEBuildILULowerTriMatrix:
   the upper part is assembled in pinned host memory (rows walked bottom-up via
   a->diag, with the diagonal stored as its reciprocal so the solve can
   multiply), uploaded, and analyzed with hipSPARSE csrsv. Value-only updates
   reuse the retained AA_h host buffer. */
static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  PetscInt n = A->rmap->n;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
  const PetscInt *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar *aa = a->a, *v;
  PetscInt *AiUp, *AjUp;
  PetscInt i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the factor data currently lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;
        /* pinned host staging buffers for fast host->device transfer */
        PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* store reciprocal: the solve multiplies instead of dividing */
          AiUp[i] = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
        PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
        PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
        PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        /* upload the assembled CSR arrays to the device */
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
        PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));

        /* perform the solve analysis */
        PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                    upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallHIP(WaitForHIP());
        PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AAUp alive (as AA_h) for cheap value-only updates; the index staging buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        PetscCallHIP(hipHostFree(AiUp));
        PetscCallHIP(hipHostFree(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* update values only: sparsity pattern is unchanged, rebuild AA_h and re-upload */
        if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;
          nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
          offset -= (nz + 1);               /* decrement the offset */

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Push the ILU factors of A to the GPU: build both triangular factors, allocate
   the shared work vector used between the L and U solves, and cache the row and
   column permutation index arrays on the device when the orderings are not
   identity (so permuted solves can gather/scatter on the GPU). */
static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscBool row_identity, col_identity;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  IS isrow = a->row, iscol = a->icol;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
  PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));

  /* intermediate device vector reused by every triangular solve */
  if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
  hipsparseTriFactors->nnz = a->nz;

  /* factors now live on both host and device */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !hipsparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  /* upper triangular indices */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !hipsparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
537: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
538: {
539: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
540: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
541: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
542: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
543: PetscInt *AiUp, *AjUp;
544: PetscScalar *AAUp;
545: PetscScalar *AALo;
546: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
547: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
548: const PetscInt *ai = b->i, *aj = b->j, *vj;
549: const MatScalar *aa = b->a, *v;
551: PetscFunctionBegin;
552: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
553: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
554: try {
555: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
556: PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
557: if (!upTriFactor && !loTriFactor) {
558: /* Allocate Space for the upper triangular matrix */
559: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
560: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
562: /* Fill the upper triangular matrix */
563: AiUp[0] = (PetscInt)0;
564: AiUp[n] = nzUpper;
565: offset = 0;
566: for (i = 0; i < n; i++) {
567: /* set the pointers */
568: v = aa + ai[i];
569: vj = aj + ai[i];
570: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
572: /* first, set the diagonal elements */
573: AjUp[offset] = (PetscInt)i;
574: AAUp[offset] = (MatScalar)1.0 / v[nz];
575: AiUp[i] = offset;
576: AALo[offset] = (MatScalar)1.0 / v[nz];
578: offset += 1;
579: if (nz > 0) {
580: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
581: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
582: for (j = offset; j < offset + nz; j++) {
583: AAUp[j] = -AAUp[j];
584: AALo[j] = AAUp[j] / v[nz];
585: }
586: offset += nz;
587: }
588: }
590: /* allocate space for the triangular factor information */
591: PetscCall(PetscNew(&upTriFactor));
592: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
594: /* Create the matrix description */
595: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
596: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
597: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
598: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
599: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
601: /* set the matrix */
602: upTriFactor->csrMat = new CsrMatrix;
603: upTriFactor->csrMat->num_rows = A->rmap->n;
604: upTriFactor->csrMat->num_cols = A->cmap->n;
605: upTriFactor->csrMat->num_entries = a->nz;
606: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
607: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
608: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
609: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
610: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
611: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
613: /* set the operation */
614: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
616: /* Create the solve analysis information */
617: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
618: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
619: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
620: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
621: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
623: /* perform the solve analysis */
624: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
625: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
627: PetscCallHIP(WaitForHIP());
628: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
630: /* assign the pointer */
631: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
633: /* allocate space for the triangular factor information */
634: PetscCall(PetscNew(&loTriFactor));
635: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
637: /* Create the matrix description */
638: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
639: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
640: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
641: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
642: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
644: /* set the operation */
645: loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
647: /* set the matrix */
648: loTriFactor->csrMat = new CsrMatrix;
649: loTriFactor->csrMat->num_rows = A->rmap->n;
650: loTriFactor->csrMat->num_cols = A->cmap->n;
651: loTriFactor->csrMat->num_entries = a->nz;
652: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
653: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
654: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
655: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
656: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
657: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
659: /* Create the solve analysis information */
660: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
661: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
662: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
663: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
664: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
666: /* perform the solve analysis */
667: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
668: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
670: PetscCallHIP(WaitForHIP());
671: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
673: /* assign the pointer */
674: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
676: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
677: PetscCallHIP(hipHostFree(AiUp));
678: PetscCallHIP(hipHostFree(AjUp));
679: } else {
680: /* Fill the upper triangular matrix */
681: offset = 0;
682: for (i = 0; i < n; i++) {
683: /* set the pointers */
684: v = aa + ai[i];
685: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
687: /* first, set the diagonal elements */
688: AAUp[offset] = 1.0 / v[nz];
689: AALo[offset] = 1.0 / v[nz];
691: offset += 1;
692: if (nz > 0) {
693: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
694: for (j = offset; j < offset + nz; j++) {
695: AAUp[j] = -AAUp[j];
696: AALo[j] = AAUp[j] / v[nz];
697: }
698: offset += nz;
699: }
700: }
701: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
702: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
704: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
705: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
706: }
707: PetscCallHIP(hipHostFree(AAUp));
708: PetscCallHIP(hipHostFree(AALo));
709: } catch (char *ex) {
710: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
711: }
712: }
713: PetscFunctionReturn(PETSC_SUCCESS);
714: }
/* Builds the IC(0) triangular factor matrices of A on the GPU and, when the
   factorization ordering is not the identity, uploads the row/column
   permutation vectors that the permuted MatSolve variants need. */
static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscBool perm_identity;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  IS ip = a->row;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
  PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
  /* device scratch vector reused by the triangular solves; allocated once */
  if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
  /* nnz of both factors combined: off-diagonal entries appear in each factor, the diagonal once */
  hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS iip;
    const PetscInt *irip, *rip;

    /* upload the permutation (rows) and its inverse (columns) so the solve can
       permute the right-hand side and solution entirely on the device */
    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
    hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
752: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
753: {
754: PetscBool perm_identity;
755: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
756: IS ip = b->row;
758: PetscFunctionBegin;
759: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
760: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
761: B->offloadmask = PETSC_OFFLOAD_CPU;
762: /* determine which version of MatSolve needs to be used. */
763: PetscCall(ISIdentity(ip, &perm_identity));
764: if (perm_identity) {
765: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
766: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
767: B->ops->matsolve = NULL;
768: B->ops->matsolvetranspose = NULL;
769: } else {
770: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
771: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
772: B->ops->matsolve = NULL;
773: B->ops->matsolvetranspose = NULL;
774: }
776: /* get the triangular factors */
777: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
778: PetscFunctionReturn(PETSC_SUCCESS);
779: }
781: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
782: {
783: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
784: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
785: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
786: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
787: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
788: hipsparseIndexBase_t indexBase;
789: hipsparseMatrixType_t matrixType;
790: hipsparseFillMode_t fillMode;
791: hipsparseDiagType_t diagType;
793: PetscFunctionBegin;
794: /* allocate space for the transpose of the lower triangular factor */
795: PetscCall(PetscNew(&loTriFactorT));
796: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
798: /* set the matrix descriptors of the lower triangular factor */
799: matrixType = hipsparseGetMatType(loTriFactor->descr);
800: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
801: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
802: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
804: /* Create the matrix description */
805: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
806: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
807: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
808: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
809: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
811: /* set the operation */
812: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
814: /* allocate GPU space for the CSC of the lower triangular factor*/
815: loTriFactorT->csrMat = new CsrMatrix;
816: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
817: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
818: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
819: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
820: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
821: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
823: /* compute the transpose of the lower triangular factor, i.e. the CSC */
824: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
825: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
826: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
827: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
828: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
829: PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
830: #endif
831: */
832: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
834: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
835: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
836: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
837: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
838: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
839: #else
840: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
841: #endif
843: PetscCallHIP(WaitForHIP());
844: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
846: /* Create the solve analysis information */
847: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
848: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
849: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
850: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
851: PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
853: /* perform the solve analysis */
854: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
855: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
857: PetscCallHIP(WaitForHIP());
858: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
860: /* assign the pointer */
861: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
863: /*********************************************/
864: /* Now the Transpose of the Upper Tri Factor */
865: /*********************************************/
867: /* allocate space for the transpose of the upper triangular factor */
868: PetscCall(PetscNew(&upTriFactorT));
869: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
871: /* set the matrix descriptors of the upper triangular factor */
872: matrixType = hipsparseGetMatType(upTriFactor->descr);
873: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
874: fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
875: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
877: /* Create the matrix description */
878: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
879: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
880: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
881: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
882: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
884: /* set the operation */
885: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
887: /* allocate GPU space for the CSC of the upper triangular factor*/
888: upTriFactorT->csrMat = new CsrMatrix;
889: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
890: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
891: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
892: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
893: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
894: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
896: /* compute the transpose of the upper triangular factor, i.e. the CSC */
897: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
898: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
899: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
900: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
901: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
902: PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
903: #endif
904: */
905: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
906: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
907: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
908: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
909: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
910: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
911: #else
912: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
913: #endif
915: PetscCallHIP(WaitForHIP());
916: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
918: /* Create the solve analysis information */
919: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
920: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
921: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
922: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
923: PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
925: /* perform the solve analysis */
926: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
927: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
929: PetscCallHIP(WaitForHIP());
930: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
932: /* assign the pointer */
933: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
934: PetscFunctionReturn(PETSC_SUCCESS);
935: }
/* Unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used below to convert csr2csc's numerically-permuted index values back
   into an integer permutation (csr2csc_i). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};
/* Builds (or refreshes) hipsparsestruct->matTranspose, an explicit transpose of A
   kept on the GPU. Returns immediately when A->transupdated is already set. For the
   CSR format the values of an existing transpose are updated in place using a cached
   permutation (csr2csc_i); for ELL/HYB the transpose is rebuilt from scratch. */
static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  hipsparseIndexBase_t indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR path can update an existing transpose in place; otherwise drop it and rebuild */
  if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
    matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
    PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
    indexBase = hipsparseGetMatIndexBase(matstruct->descr);
    PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-resident scalar constants used by SpMV-style calls */
    PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
    PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));

    if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
      /* the transpose swaps A's dimensions; the nonzero count is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* cache A's row offsets on the GPU; also used later by the csr2csc value update */
      if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

      PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                                            indexBase, hipsparse_scalartype));
    } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT dims mirror temp's (rmap x cmap) rather than being swapped;
         this is only correct for square matrices — confirm for rectangular A */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                                           tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));

      /* Last, convert CSC to HYB */
      hipsparseHybMat_t hybMat;
      PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
      hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
      PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));

      /* assign the pointer */
      matstructT->mat = hybMat;
      /* the HYB transpose is complete here; the CSR update below is skipped for this format
         (transupdated is also set unconditionally at the end of the function) */
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
    }
  }
  if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!hipsparsestruct->csr2csc_i) {
      /* run csr2csc once on the value sequence 0,1,2,... so that the transposed
         "values" are exactly the permutation mapping A's nonzeros to A^T's;
         cache it in csr2csc_i so later updates are a single gather (copy below) */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = hipsparseGetMatIndexBase(matstruct->descr);
      if (matrix->num_entries) {
        /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
           Need to verify this for ROCm.
        */
        PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
      } else {
        /* empty matrix: row offsets are all equal to the index base */
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
    }
    /* gather A's current values into the transpose layout via the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
/* Solves A^T x = b on the GPU using the explicitly transposed triangular factors
   (built lazily on first use) and the row/column permutations uploaded by
   MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU. */
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt n = xx->map->n;
  const PetscScalar *barray;
  PetscScalar *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar> xGPU;
  Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
  Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecHIPGetArrayWrite(xx, &xarray));
  PetscCall(VecHIPGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed lower factor */
  PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecHIPRestoreArrayRead(bb, &barray));
  PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1136: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1137: {
1138: const PetscScalar *barray;
1139: PetscScalar *xarray;
1140: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1141: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1142: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1143: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1145: PetscFunctionBegin;
1146: /* Analyze the matrix and create the transpose ... on the fly */
1147: if (!loTriFactorT && !upTriFactorT) {
1148: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1149: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1150: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1151: }
1153: /* Get the GPU pointers */
1154: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1155: PetscCall(VecHIPGetArrayRead(bb, &barray));
1157: PetscCall(PetscLogGpuTimeBegin());
1158: /* First, solve U */
1159: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1160: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1162: /* Then, solve L */
1163: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1164: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1166: /* restore */
1167: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1168: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1169: PetscCall(PetscLogGpuTimeEnd());
1170: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1171: PetscFunctionReturn(PETSC_SUCCESS);
1172: }
1174: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1175: {
1176: const PetscScalar *barray;
1177: PetscScalar *xarray;
1178: thrust::device_ptr<const PetscScalar> bGPU;
1179: thrust::device_ptr<PetscScalar> xGPU;
1180: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1181: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1182: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1183: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1185: PetscFunctionBegin;
1186: /* Get the GPU pointers */
1187: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1188: PetscCall(VecHIPGetArrayRead(bb, &barray));
1189: xGPU = thrust::device_pointer_cast(xarray);
1190: bGPU = thrust::device_pointer_cast(barray);
1192: PetscCall(PetscLogGpuTimeBegin());
1193: /* First, reorder with the row permutation */
1194: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1196: /* Next, solve L */
1197: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1198: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1200: /* Then, solve U */
1201: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1202: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1204: /* Last, reorder with the column permutation */
1205: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1207: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1208: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1209: PetscCall(PetscLogGpuTimeEnd());
1210: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1211: PetscFunctionReturn(PETSC_SUCCESS);
1212: }
1214: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1215: {
1216: const PetscScalar *barray;
1217: PetscScalar *xarray;
1218: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1219: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1220: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1221: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1223: PetscFunctionBegin;
1224: /* Get the GPU pointers */
1225: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1226: PetscCall(VecHIPGetArrayRead(bb, &barray));
1228: PetscCall(PetscLogGpuTimeBegin());
1229: /* First, solve L */
1230: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1231: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233: /* Next, solve U */
1234: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1235: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1238: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1239: PetscCall(PetscLogGpuTimeEnd());
1240: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1241: PetscFunctionReturn(PETSC_SUCCESS);
1242: }
1244: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1245: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
/* MatSolve_SeqAIJHIPSPARSE_ILU0 - triangular solves with the in-place ILU(0) factor

   Solves L*(U*x) = b where L (unit diagonal) and U share the single factored CSR
   matrix produced by MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(). The intermediate
   y = L^{-1} b is held in the work array fs->Y; the dense-vector descriptors
   fs->dnVecDescr_X/Y are rebound to the current device arrays before each solve.
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b; X is bound to b, Y to the work array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* ROCm 5.6+ removed the explicit buffer argument from hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
#endif
  /* Solve U*x = y; rebind X to the solution array (note the swapped Y/X argument order) */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* two triangular solves */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* MatSolveTranspose_SeqAIJHIPSPARSE_ILU0 - solve A^T x = b with the ILU(0) factors

   Since A = L*U, A^T = U^T*L^T: we solve U^T y = b then L^T x = y, reusing the
   non-transposed matrix descriptors with HIPSPARSE_OPERATION_TRANSPOSE. The
   transpose SpSV descriptors and buffers are created lazily on the first call;
   their (numeric) analysis is redone whenever the factor values change, as
   signalled by fs->updatedTransposeSpSVAnalysis being reset in
   MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                                fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* SpSV analysis reads matrix values, so it must be redone after each numeric factorization */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b; X is bound to b, Y to the work array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* ROCm 5.6+ removed the explicit buffer argument from hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
#endif
  /* Solve Lt*x = y; rebind X to the solution array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* two triangular solves */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1342: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1343: {
1344: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1345: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1346: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1347: CsrMatrix *Acsr;
1348: PetscInt m, nz;
1349: PetscBool flg;
1351: PetscFunctionBegin;
1352: if (PetscDefined(USE_DEBUG)) {
1353: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1354: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1355: }
1357: /* Copy A's value to fact */
1358: m = fact->rmap->n;
1359: nz = aij->nz;
1360: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1361: Acsr = (CsrMatrix *)Acusp->mat->mat;
1362: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1364: /* Factorize fact inplace */
1365: if (m)
1366: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1367: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1368: if (PetscDefined(USE_DEBUG)) {
1369: int numerical_zero;
1370: hipsparseStatus_t status;
1371: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1372: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1373: }
1375: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1376: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1378: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1380: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1381: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1383: fact->offloadmask = PETSC_OFFLOAD_GPU;
1384: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
1385: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1386: fact->ops->matsolve = NULL;
1387: fact->ops->matsolvetranspose = NULL;
1388: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1389: PetscFunctionReturn(PETSC_SUCCESS);
1390: }
1392: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1393: {
1394: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1395: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1396: PetscInt m, nz;
1398: PetscFunctionBegin;
1399: if (PetscDefined(USE_DEBUG)) {
1400: PetscInt i;
1401: PetscBool flg, missing;
1403: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406: PetscCall(MatMissingDiagonal(A, &missing, &i));
1407: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1408: }
1410: /* Free the old stale stuff */
1411: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1413: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414: but they will not be used. Allocate them just for easy debugging.
1415: */
1416: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1418: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1419: fact->factortype = MAT_FACTOR_ILU;
1420: fact->info.factor_mallocs = 0;
1421: fact->info.fill_ratio_given = info->fill;
1422: fact->info.fill_ratio_needed = 1.0;
1424: aij->row = NULL;
1425: aij->col = NULL;
1427: /* ====================================================================== */
1428: /* Copy A's i, j to fact and also allocate the value array of fact. */
1429: /* We'll do in-place factorization on fact */
1430: /* ====================================================================== */
1431: const int *Ai, *Aj;
1433: m = fact->rmap->n;
1434: nz = aij->nz;
1436: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1443: /* ====================================================================== */
1444: /* Create descriptors for M, L, U */
1445: /* ====================================================================== */
1446: hipsparseFillMode_t fillMode;
1447: hipsparseDiagType_t diagType;
1449: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1453: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458: */
1459: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1465: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1471: /* ========================================================================= */
1472: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1473: /* ========================================================================= */
1474: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475: if (m)
1476: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1479: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1482: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1485: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1488: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1491: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493: */
1494: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496: fs->spsvBuffer_L = fs->factBuffer_M;
1497: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498: } else {
1499: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500: fs->spsvBuffer_U = fs->factBuffer_M;
1501: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502: }
1504: /* ========================================================================== */
1505: /* Perform analysis of ilu0 on M, SpSv on L and U */
1506: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507: /* ========================================================================== */
1508: int structural_zero;
1510: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511: if (m)
1512: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514: if (PetscDefined(USE_DEBUG)) {
1515: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516: hipsparseStatus_t status;
1517: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519: }
1521: /* Estimate FLOPs of the numeric factorization */
1522: {
1523: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1524: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1525: PetscLogDouble flops = 0.0;
1527: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1528: Ai = Aseq->i;
1529: Adiag = Aseq->diag;
1530: for (PetscInt i = 0; i < m; i++) {
1531: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532: nzRow = Ai[i + 1] - Ai[i];
1533: nzLeft = Adiag[i] - Ai[i];
1534: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536: */
1537: nzLeft = (nzRow - 1) / 2;
1538: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539: }
1540: }
1541: fs->numericFactFlops = flops;
1542: }
1543: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544: PetscFunctionReturn(PETSC_SUCCESS);
1545: }
/* MatSolve_SeqAIJHIPSPARSE_ICC0 - triangular solves with the IC(0) factor

   For a Cholesky-type factorization A = L*L^T, solves L y = b followed by
   L^T x = y, reusing the single L descriptor with HIPSPARSE_OPERATION_TRANSPOSE
   for the second solve. The intermediate y lives in fs->Y; the dense-vector
   descriptors are rebound to the current device arrays before each solve.
*/
static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs  = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecHIPGetArrayWrite(x, &xarray));
  PetscCall(VecHIPGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b; X is bound to b, Y to the work array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  /* ROCm 5.6+ removed the explicit buffer argument from hipsparseSpSV_solve() */
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                         fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
#endif
  /* Solve Lt*x = y; rebind X to the solution array */
  PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
#if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
#else
  PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                         fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
#endif
  PetscCall(VecHIPRestoreArrayRead(b, &barray));
  PetscCall(VecHIPRestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); /* two triangular solves */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0 - numeric incomplete Cholesky (IC(0)) on device

   Copies A's current device values into the factor and runs hipSPARSE's in-place
   csric02, then performs the (numeric) SpSV analyses for the L and L^T solves.
   The info argument is part of the interface and unused here.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs    = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                    *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJHIPSPARSE           *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix                     *Acsr;
  PetscInt                       m, nz;
  PetscBool                      flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));

  /* Factorize fact inplace */
  /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
     Guarded with `if (m)` because csric02 errors out with empty matrices.
   */
  if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int               numerical_zero;
    hipsparseStatus_t status;
    status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (it reads matrix values), so it must follow the factorization */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
   ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
   */
  PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJHIPSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJHIPSPARSE_ICC0; /* A = L*L^T is symmetric, so the transpose solve is the same as the solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0 - symbolic phase of level-0 incomplete Cholesky
   (ICC(0)) done on the GPU via hipSPARSE's csric02 routines.

   Because ICC(0) keeps exactly A's sparsity pattern, the "symbolic" work here is:
   copy A's CSR structure into fact, create hipSPARSE descriptors for the whole matrix M
   and its lower-triangular view L, size/allocate the csric02 and SpSV work buffers,
   run the csric02 analysis, and estimate the flops of the later numeric phase.

   Callers (see MatICCFactorSymbolic_SeqAIJHIPSPARSE) only take this path when perm is the
   identity and info->levels is 0, so no reordering is applied here.
*/
1638: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1639: {
1640: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1641: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1642: PetscInt m, nz;
1644: PetscFunctionBegin;
1645: if (PetscDefined(USE_DEBUG)) {
1646: PetscInt i;
1647: PetscBool flg, missing;
1649: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1650: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1651: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1652: PetscCall(MatMissingDiagonal(A, &missing, &i));
1653: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1654: }
1656: /* Free the old stale stuff */
1657: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1659: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1660: but they will not be used. Allocate them just for easy debugging.
1661: */
1662: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1664: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1665: fact->factortype = MAT_FACTOR_ICC;
1666: fact->info.factor_mallocs = 0;
1667: fact->info.fill_ratio_given = info->fill;
1668: fact->info.fill_ratio_needed = 1.0; /* ICC(0) introduces no fill */
1670: aij->row = NULL; /* natural ordering only: keep no permutation index sets on fact */
1671: aij->col = NULL;
1673: /* ====================================================================== */
1674: /* Copy A's i, j to fact and also allocate the value array of fact. */
1675: /* We'll do in-place factorization on fact */
1676: /* ====================================================================== */
1677: const int *Ai, *Aj;
1679: m = fact->rmap->n;
1680: nz = aij->nz;
1682: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1683: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1684: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1685: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1686: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1687: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1689: /* ====================================================================== */
1690: /* Create mat descriptors for M, L */
1691: /* ====================================================================== */
1692: hipsparseFillMode_t fillMode;
1693: hipsparseDiagType_t diagType;
1695: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1696: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1697: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1699: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1700: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1701: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1702: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1703: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1704: */
1705: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1706: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
/* L shares csrRowPtr/csrColIdx/csrVal with M; only the fill-mode/diag-type attributes differ */
1707: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1708: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1709: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1711: /* ========================================================================= */
1712: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1713: /* ========================================================================= */
1714: PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1715: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* guard: skip the hipSPARSE call for empty matrices */
1717: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1718: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1720: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1721: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1723: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1724: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1726: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1727: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1729: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1730: See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1731: */
1732: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1733: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1734: fs->spsvBuffer_L = fs->factBuffer_M; /* the larger SpSV buffer is aliased onto the factorization buffer */
1735: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1736: } else {
1737: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1738: fs->spsvBuffer_Lt = fs->factBuffer_M;
1739: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1740: }
1742: /* ========================================================================== */
1743: /* Perform analysis of ic0 on M */
1744: /* The lower triangular part of M has the same sparsity pattern as L */
1745: /* ========================================================================== */
1746: int structural_zero;
1748: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1749: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1750: if (PetscDefined(USE_DEBUG)) {
1751: hipsparseStatus_t status;
1752: /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1753: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1754: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1755: }
1757: /* Estimate FLOPs of the numeric factorization */
1758: {
1759: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1760: PetscInt *Ai, nzRow, nzLeft; /* shadows the device `const int *Ai` above; this one is the host row-pointer array */
1761: PetscLogDouble flops = 0.0;
1763: Ai = Aseq->i;
1764: for (PetscInt i = 0; i < m; i++) {
1765: nzRow = Ai[i + 1] - Ai[i];
1766: if (nzRow > 1) {
1767: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1768: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1769: */
1770: nzLeft = (nzRow - 1) / 2;
1771: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1772: }
1773: }
1774: fs->numericFactFlops = flops; /* logged by the numeric phase (see MatICCFactorNumeric_..._ICC0) */
1775: }
1776: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1777: PetscFunctionReturn(PETSC_SUCCESS);
1778: }
1779: #endif
/* MatILUFactorSymbolic_SeqAIJHIPSPARSE - symbolic ILU factorization dispatcher.

   When ILU(0) is requested (info->levels == 0), both row and column permutations are
   the identity, factorization is bound to the device, and HIP >= 4.5.0, take the
   all-on-GPU ILU0 fast path. Otherwise reset any stale device triangular factors and
   fall back to the host symbolic phase, with the numeric phase still routed through
   the HIPSPARSE kernel.
*/
1781: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1782: {
1783: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1785: PetscFunctionBegin;
1786: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1787: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1788: if (hipsparseTriFactors->factorizeOnDevice) { /* only probe the permutations when the device path is even allowed */
1789: PetscCall(ISIdentity(isrow, &row_identity));
1790: PetscCall(ISIdentity(iscol, &col_identity));
1791: }
1792: if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1793: else
1794: #endif
1795: {
1796: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1797: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); /* host symbolic phase */
1798: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1799: }
1800: PetscFunctionReturn(PETSC_SUCCESS);
1801: }
1803: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1804: {
1805: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1807: PetscFunctionBegin;
1808: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1809: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1810: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1811: PetscFunctionReturn(PETSC_SUCCESS);
1812: }
/* MatICCFactorSymbolic_SeqAIJHIPSPARSE - symbolic ICC factorization dispatcher.

   When ICC(0) is requested (info->levels == 0), the permutation is the identity,
   factorization is bound to the device, and HIP >= 4.5.0, take the all-on-GPU ICC0
   fast path. Otherwise reset any stale device factors and fall back to the host
   symbolic phase, leaving the numeric phase to the HIPSPARSE kernel.
*/
1814: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1815: {
1816: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1818: PetscFunctionBegin;
1819: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1820: PetscBool perm_identity = PETSC_FALSE;
1821: if (hipsparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); /* only probe perm when the device path is allowed */
1822: if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1823: else
1824: #endif
1825: {
1826: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1827: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); /* host symbolic phase */
1828: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1829: }
1830: PetscFunctionReturn(PETSC_SUCCESS);
1831: }
1833: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1834: {
1835: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1837: PetscFunctionBegin;
1838: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1839: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1840: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1841: PetscFunctionReturn(PETSC_SUCCESS);
1842: }
1844: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1845: {
1846: PetscFunctionBegin;
1847: *type = MATSOLVERHIPSPARSE;
1848: PetscFunctionReturn(PETSC_SUCCESS);
1849: }
1851: /*MC
1852: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1853: on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
1854: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1855: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1856: HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1857: algorithms are not recommended. This class does NOT support direct solver operations.
1859: Level: beginner
1861: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1862: M*/
/* MatGetFactor_seqaijhipsparse_hipsparse - create an (empty) factor matrix of type
   MATSEQAIJHIPSPARSE for LU/ILU/ILUDT or Cholesky/ICC factorization of A.

   Reads -mat_factor_bind_factorization {host|device} (default "device") to decide where
   the factorization itself will run, installs the corresponding symbolic-factorization
   function pointers (falling back to the plain SeqAIJ ones when A is bound to the CPU),
   and records the preferred orderings per factor type. The actual storage is allocated
   later by the symbolic phase (MAT_SKIP_ALLOCATION below).
*/
1864: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1865: {
1866: PetscInt n = A->rmap->n;
1867: PetscBool factOnDevice, factOnHost;
1868: char *prefix;
1869: char factPlace[32] = "device"; /* the default */
1871: PetscFunctionBegin;
1872: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1873: PetscCall(MatSetSizes(*B, n, n, n, n)); /* factor matrices are square */
1874: (*B)->factortype = ftype;
1875: PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1877: prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1878: PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
1879: PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1880: PetscOptionsEnd();
1881: PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1882: PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1883: PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1884: ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
1886: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); /* propagate A's CPU binding to the factor */
1887: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1888: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1889: if (!A->boundtocpu) {
1890: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1891: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1892: } else {
1893: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1894: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1895: }
1896: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1897: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1898: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1899: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1900: if (!A->boundtocpu) {
1901: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1902: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1903: } else {
1904: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1905: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1906: }
1907: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1908: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1909: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1911: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); /* symbolic phase will allocate the storage */
1912: (*B)->canuseordering = PETSC_TRUE;
1913: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1914: PetscFunctionReturn(PETSC_SUCCESS);
1915: }
/* MatSeqAIJHIPSPARSECopyFromGPU - bring the matrix values back from the device into the
   host CSR value array (a->a) when the up-to-date copy lives only on the GPU.

   Only the numerical values (a->nz scalars) are copied; the host sparsity pattern is
   left untouched. Works for unfactored matrices (values from the hipSPARSE CSR struct)
   and, with HIP >= 4.5.0, for ICC0/ILU0-factored matrices (values from fs->csrVal);
   other factored layouts are rejected. On success the offload mask becomes BOTH.
*/
1917: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1918: {
1919: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1920: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1921: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1922: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr; /* same spptr, reinterpreted for factored matrices */
1923: #endif
1925: PetscFunctionBegin;
1926: if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* nothing to do unless the device copy is the only current one */
1927: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1928: if (A->factortype == MAT_FACTOR_NONE) {
1929: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1930: PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1931: }
1932: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1933: else if (fs->csrVal) {
1934: /* We have a factorized matrix on device and are able to copy it to host */
1935: PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1936: }
1937: #endif
1938: else
1939: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1940: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1941: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1942: A->offloadmask = PETSC_OFFLOAD_BOTH;
1943: }
1944: PetscFunctionReturn(PETSC_SUCCESS);
1945: }
1947: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1948: {
1949: PetscFunctionBegin;
1950: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1951: *array = ((Mat_SeqAIJ *)A->data)->a;
1952: PetscFunctionReturn(PETSC_SUCCESS);
1953: }
1955: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1956: {
1957: PetscFunctionBegin;
1958: A->offloadmask = PETSC_OFFLOAD_CPU;
1959: *array = NULL;
1960: PetscFunctionReturn(PETSC_SUCCESS);
1961: }
1963: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1964: {
1965: PetscFunctionBegin;
1966: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1967: *array = ((Mat_SeqAIJ *)A->data)->a;
1968: PetscFunctionReturn(PETSC_SUCCESS);
1969: }
/* End read-only access to the host value array. Since the caller could not have
   modified the values, the offload mask is deliberately left unchanged. */
1971: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1972: {
1973: PetscFunctionBegin;
1974: *array = NULL;
1975: PetscFunctionReturn(PETSC_SUCCESS);
1976: }
1978: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1979: {
1980: PetscFunctionBegin;
1981: *array = ((Mat_SeqAIJ *)A->data)->a;
1982: PetscFunctionReturn(PETSC_SUCCESS);
1983: }
1985: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1986: {
1987: PetscFunctionBegin;
1988: A->offloadmask = PETSC_OFFLOAD_CPU;
1989: *array = NULL;
1990: PetscFunctionReturn(PETSC_SUCCESS);
1991: }
1993: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1994: {
1995: Mat_SeqAIJHIPSPARSE *cusp;
1996: CsrMatrix *matrix;
1998: PetscFunctionBegin;
1999: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2000: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2001: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
2002: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2003: matrix = (CsrMatrix *)cusp->mat->mat;
2005: if (i) {
2006: #if !defined(PETSC_USE_64BIT_INDICES)
2007: *i = matrix->row_offsets->data().get();
2008: #else
2009: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2010: #endif
2011: }
2012: if (j) {
2013: #if !defined(PETSC_USE_64BIT_INDICES)
2014: *j = matrix->column_indices->data().get();
2015: #else
2016: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2017: #endif
2018: }
2019: if (a) *a = matrix->values->data().get();
2020: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2021: PetscFunctionReturn(PETSC_SUCCESS);
2022: }
/* MatSeqAIJHIPSPARSECopyToGPU - make the device representation of a SeqAIJ matrix current.

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the storage
   format is CSR, only the values are re-uploaded. Otherwise the whole device structure
   is destroyed and rebuilt from the host arrays, honoring the compressed-row form when
   in use, and converting to ELL/HYB on the fly if that format was requested. Errors if
   the matrix is bound to the CPU.
*/
2024: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2025: {
2026: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2027: Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
2028: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2029: PetscBool both = PETSC_TRUE;
2030: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2032: PetscFunctionBegin;
2033: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2034: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2035: if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2036: CsrMatrix *matrix;
2037: matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
2039: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2040: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2041: matrix->values->assign(a->a, a->a + a->nz);
2042: PetscCallHIP(WaitForHIP());
2043: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2044: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2045: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* values changed: transpose values stale, structure still valid */
2046: } else {
2047: PetscInt nnz;
2048: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2049: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2050: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE)); /* pattern changed: destroy the transpose entirely */
2051: delete hipsparsestruct->workVector;
2052: delete hipsparsestruct->rowoffsets_gpu;
2053: hipsparsestruct->workVector = NULL;
2054: hipsparsestruct->rowoffsets_gpu = NULL;
2055: try {
2056: if (a->compressedrow.use) { /* upload only rows with nonzeros; ridx maps them back to global rows */
2057: m = a->compressedrow.nrows;
2058: ii = a->compressedrow.i;
2059: ridx = a->compressedrow.rindex;
2060: } else {
2061: m = A->rmap->n;
2062: ii = a->i;
2063: ridx = NULL;
2064: }
2065: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2066: if (!a->a) {
2067: nnz = ii[m];
2068: both = PETSC_FALSE; /* no host values: only the pattern goes up, so the mask must not become BOTH */
2069: } else nnz = a->nz;
2070: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2072: /* create hipsparse matrix */
2073: hipsparsestruct->nrows = m;
2074: matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
2075: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2076: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2077: PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
/* Device-resident scalar constants 1 and 0, used with HIPSPARSE_POINTER_MODE_DEVICE below */
2079: PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2080: PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2081: PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2082: PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2083: PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2084: PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2085: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2087: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2088: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2089: /* set the matrix */
2090: CsrMatrix *mat = new CsrMatrix;
2091: mat->num_rows = m;
2092: mat->num_cols = A->cmap->n;
2093: mat->num_entries = nnz;
2094: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2095: mat->column_indices = new THRUSTINTARRAY32(nnz);
2096: mat->values = new THRUSTARRAY(nnz);
2097: mat->row_offsets->assign(ii, ii + m + 1);
2098: mat->column_indices->assign(a->j, a->j + nnz);
2099: if (a->a) mat->values->assign(a->a, a->a + nnz);
2101: /* assign the pointer */
2102: matstruct->mat = mat;
2103: if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2104: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2105: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2106: }
2107: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
/* Stage a temporary CSR matrix on the device, convert it to HYB/ELL, then free the CSR staging copy */
2108: CsrMatrix *mat = new CsrMatrix;
2109: mat->num_rows = m;
2110: mat->num_cols = A->cmap->n;
2111: mat->num_entries = nnz;
2112: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2113: mat->column_indices = new THRUSTINTARRAY32(nnz);
2114: mat->values = new THRUSTARRAY(nnz);
2115: mat->row_offsets->assign(ii, ii + m + 1);
2116: mat->column_indices->assign(a->j, a->j + nnz);
2117: if (a->a) mat->values->assign(a->a, a->a + nnz);
2119: hipsparseHybMat_t hybMat;
2120: PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2121: hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2122: PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2123: /* assign the pointer */
2124: matstruct->mat = hybMat;
2126: if (mat) {
2127: if (mat->values) delete (THRUSTARRAY *)mat->values;
2128: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2129: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2130: delete (CsrMatrix *)mat;
2131: }
2132: }
2134: /* assign the compressed row indices */
2135: if (a->compressedrow.use) {
2136: hipsparsestruct->workVector = new THRUSTARRAY(m);
2137: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2138: matstruct->cprowIndices->assign(ridx, ridx + m);
2139: tmp = m;
2140: } else {
2141: hipsparsestruct->workVector = NULL;
2142: matstruct->cprowIndices = NULL;
2143: tmp = 0;
2144: }
2145: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2147: /* assign the pointer */
2148: hipsparsestruct->mat = matstruct;
2149: } catch (char *ex) {
2150: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2151: }
2152: PetscCallHIP(WaitForHIP());
2153: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2154: hipsparsestruct->nonzerostate = A->nonzerostate;
2155: }
2156: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; /* 'both' is false when host values were absent (pattern-only upload) */
2157: }
2158: PetscFunctionReturn(PETSC_SUCCESS);
2159: }
2161: struct VecHIPPlusEquals {
2162: template <typename Tuple>
2163: __host__ __device__ void operator()(Tuple t)
2164: {
2165: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2166: }
2167: };
2169: struct VecHIPEquals {
2170: template <typename Tuple>
2171: __host__ __device__ void operator()(Tuple t)
2172: {
2173: thrust::get<1>(t) = thrust::get<0>(t);
2174: }
2175: };
2177: struct VecHIPEqualsReverse {
2178: template <typename Tuple>
2179: __host__ __device__ void operator()(Tuple t)
2180: {
2181: thrust::get<0>(t) = thrust::get<1>(t);
2182: }
2183: };
/* MatMatHipsparse - context stored in C->product->data for matrix-matrix products
   involving a SeqAIJHIPSPARSE matrix; freed by MatDestroy_MatMatHipsparse(). */
2185: struct MatMatHipsparse {
2186: PetscBool cisdense; /* if true, C is converted back to MATSEQDENSE after the numeric phase */
2187: PetscScalar *Bt; /* device buffer (freed with hipFree); presumably holds an explicit B^T -- confirm against the SpGEMM paths */
2188: Mat X; /* auxiliary dense matrix holding the intermediate SpMM result for PtAP/RARt */
2189: PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2190: PetscLogDouble flops;
2191: CsrMatrix *Bcsr;
2192: hipsparseSpMatDescr_t matSpBDescr;
2193: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2194: hipsparseDnMatDescr_t matBDescr; /* dense descriptor for B */
2195: hipsparseDnMatDescr_t matCDescr; /* dense descriptor for C (or for X in PtAP/RARt) */
2196: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2197: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2198: void *dBuffer4, *dBuffer5;
2199: #endif
2200: size_t mmBufferSize; /* current allocation size of mmBuffer */
2201: void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2202: hipsparseSpGEMMDescr_t spgemmDesc;
2203: };
2205: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2206: {
2207: MatMatHipsparse *mmdata = (MatMatHipsparse *)data;
2209: PetscFunctionBegin;
2210: PetscCallHIP(hipFree(mmdata->Bt));
2211: delete mmdata->Bcsr;
2212: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2213: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2214: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2215: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2216: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2217: if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2218: if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2219: #endif
2220: if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2221: if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2222: PetscCall(MatDestroy(&mmdata->X));
2223: PetscCall(PetscFree(data));
2224: PetscFunctionReturn(PETSC_SUCCESS);
2225: }
/* MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP - numeric phase of C = op(A) op(B)
   where A is MATSEQAIJHIPSPARSE and B is dense.

   Dispatches on the product type to pick op(A), the source struct (A or its explicit
   transpose), and the result dimensions; uploads B to the GPU if it was passed as a
   host dense matrix; then performs a single hipsparseSpMM. For PtAP/RARt the SpMM
   result is written into the auxiliary dense matrix mmdata->X and followed by a
   dense-dense multiply with B to finish the triple product. Dense matrix descriptors
   and the SpMM buffer are cached in mmdata and rebuilt only when a leading dimension
   changes.
*/
2227: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2228: {
2229: Mat_Product *product = C->product;
2230: Mat A, B;
2231: PetscInt m, n, blda, clda;
2232: PetscBool flg, biship;
2233: Mat_SeqAIJHIPSPARSE *cusp;
2234: hipsparseOperation_t opA;
2235: const PetscScalar *barray;
2236: PetscScalar *carray;
2237: MatMatHipsparse *mmdata;
2238: Mat_SeqAIJHIPSPARSEMultStruct *mat;
2239: CsrMatrix *csrmat;
2241: PetscFunctionBegin;
2242: MatCheckProduct(C, 1);
2243: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2244: mmdata = (MatMatHipsparse *)product->data;
2245: A = product->A;
2246: B = product->B;
2247: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2248: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2249: /* currently CopyToGpu does not copy if the matrix is bound to CPU
2250: Instead of silently accepting the wrong answer, I prefer to raise the error */
2251: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2252: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2253: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
/* Select op(A), which device struct to use, and the dimensions (m x n) of the SpMM result */
2254: switch (product->type) {
2255: case MATPRODUCT_AB:
2256: case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; the Pt* part is a dense multiply at the end */
2257: mat = cusp->mat;
2258: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2259: m = A->rmap->n;
2260: n = B->cmap->n;
2261: break;
2262: case MATPRODUCT_AtB:
2263: if (!A->form_explicit_transpose) {
2264: mat = cusp->mat;
2265: opA = HIPSPARSE_OPERATION_TRANSPOSE;
2266: } else { /* use the cached explicit transpose instead of a transposed SpMM */
2267: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2268: mat = cusp->matTranspose;
2269: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2270: }
2271: m = A->cmap->n;
2272: n = B->cmap->n;
2273: break;
2274: case MATPRODUCT_ABt:
2275: case MATPRODUCT_RARt: /* RARt computes X = A*R^t here; the R* part is a dense multiply at the end */
2276: mat = cusp->mat;
2277: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2278: m = A->rmap->n;
2279: n = B->rmap->n;
2280: break;
2281: default:
2282: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2283: }
2284: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2285: csrmat = (CsrMatrix *)mat->mat;
2286: /* if the user passed a CPU matrix, copy the data to the GPU */
2287: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2288: if (!biship) { PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); }
2289: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2290: PetscCall(MatDenseGetLDA(B, &blda));
2291: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2292: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); /* SpMM result goes to the intermediate X, not C */
2293: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2294: } else {
2295: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2296: PetscCall(MatDenseGetLDA(C, &clda));
2297: }
2299: PetscCall(PetscLogGpuTimeBegin());
2300: hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2301: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2302: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2303: size_t mmBufferSize;
2304: if (mmdata->initialized && mmdata->Blda != blda) {
2305: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2306: mmdata->matBDescr = NULL;
2307: }
2308: if (!mmdata->matBDescr) {
2309: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2310: mmdata->Blda = blda;
2311: }
2312: if (mmdata->initialized && mmdata->Clda != clda) {
2313: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2314: mmdata->matCDescr = NULL;
2315: }
2316: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2317: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2318: mmdata->Clda = clda;
2319: }
2320: if (!mat->matDescr) {
2321: PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2322: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2323: }
2324: PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2325: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { /* grow-only reallocation of the cached buffer */
2326: PetscCallHIP(hipFree(mmdata->mmBuffer));
2327: PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2328: mmdata->mmBufferSize = mmBufferSize;
2329: }
2330: mmdata->initialized = PETSC_TRUE;
2331: } else {
2332: /* to be safe, always update pointers of the mats */
2333: PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2334: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2335: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2336: }
2338: /* do hipsparseSpMM, which supports transpose on B */
2339: PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2341: PetscCall(PetscLogGpuTimeEnd());
2342: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2343: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2344: if (product->type == MATPRODUCT_RARt) {
2345: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2346: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); /* C = R * X */
2347: } else if (product->type == MATPRODUCT_PtAP) {
2348: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2349: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); /* C = P^t * X */
2350: } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2351: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); /* restore C's original (host dense) type */
2352: if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); /* undo the temporary in-place conversion of B */
2353: PetscFunctionReturn(PETSC_SUCCESS);
2354: }
2356: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2357: {
2358: Mat_Product *product = C->product;
2359: Mat A, B;
2360: PetscInt m, n;
2361: PetscBool cisdense, flg;
2362: MatMatHipsparse *mmdata;
2363: Mat_SeqAIJHIPSPARSE *cusp;
2365: PetscFunctionBegin;
2366: MatCheckProduct(C, 1);
2367: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2368: A = product->A;
2369: B = product->B;
2370: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2371: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2372: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2373: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2374: switch (product->type) {
2375: case MATPRODUCT_AB:
2376: m = A->rmap->n;
2377: n = B->cmap->n;
2378: break;
2379: case MATPRODUCT_AtB:
2380: m = A->cmap->n;
2381: n = B->cmap->n;
2382: break;
2383: case MATPRODUCT_ABt:
2384: m = A->rmap->n;
2385: n = B->rmap->n;
2386: break;
2387: case MATPRODUCT_PtAP:
2388: m = B->cmap->n;
2389: n = B->cmap->n;
2390: break;
2391: case MATPRODUCT_RARt:
2392: m = B->rmap->n;
2393: n = B->rmap->n;
2394: break;
2395: default:
2396: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2397: }
2398: PetscCall(MatSetSizes(C, m, n, m, n));
2399: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2400: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2401: PetscCall(MatSetType(C, MATSEQDENSEHIP));
2403: /* product data */
2404: PetscCall(PetscNew(&mmdata));
2405: mmdata->cisdense = cisdense;
2406: /* for these products we need intermediate storage */
2407: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2408: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2409: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2410: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2411: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2412: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2413: }
2414: C->product->data = mmdata;
2415: C->product->destroy = MatDestroy_MatMatHipsparse;
2416: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2417: PetscFunctionReturn(PETSC_SUCCESS);
2418: }
/* Numeric phase for C = op(A) op(B) with A, B, C all SeqAIJHIPSPARSE.
   Reuses the SpGEMM descriptors and buffers created by the symbolic phase
   (stored in C->product->data) and recomputes only the values of C on the GPU.
   If mmdata->reusesym is set, the values were already computed during the
   symbolic phase, so only the assembly bookkeeping at "finalize" is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatMatHipsparse               *mmdata;
  hipsparseSpMatDescr_t          BmatSpDescr;
  hipsparseOperation_t           opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatHipsparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJHIPSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute when C has no nonzeros; still flag it assembled below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));

  /* when symmetry was exploited in the symbolic phase, replay the same product-type
     substitution (AtB/ABt -> AB) so we pick the same mult structs as symbolic did */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transpose products use the explicitly stored transpose, since spgemm cannot transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  /* hipSPARSE >= 5.1: SpGEMMreuse keeps the sparsity analysis from symbolic, so only values are recomputed */
  PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#else
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#endif
#else
  /* legacy csrgemm interface (pre-5.0): single call computes the whole product in place */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallHIP(WaitForHIP());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic phase for C = op(A) op(B) with A, B both SeqAIJHIPSPARSE.
   Determines the sparsity pattern of C via hipSPARSE SpGEMM on the GPU,
   allocates C's CSR storage (device and host), and stashes the SpGEMM
   descriptors/buffers in C->product->data (MatMatHipsparse) so the numeric
   phase can recompute values without redoing the analysis.
   Handles compressed-row inputs: if A is compressed, C inherits the
   compression; if B is compressed, a full-row-offset CSR view of B is built.
   NOTE(review): on hipSPARSE >= 5.1 the values of C are actually computed here
   (SpGEMMreuse_compute at the end), which is what enables mmdata->reusesym. */
static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
{
  Mat_Product                   *product = C->product;
  Mat                            A, B;
  Mat_SeqAIJHIPSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                    *a, *b, *c;
  Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                     *Acsr, *Bcsr, *Ccsr;
  PetscInt                       i, j, m, n, k;
  PetscBool                      flg;
  MatProductType                 ptype;
  MatMatHipsparse               *mmdata;
  PetscLogDouble                 flops;
  PetscBool                      biscompressed, ciscompressed;
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  int64_t               C_num_rows1, C_num_cols1, C_nnz1;
  hipsparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatHipsparse;

  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");

  /* exploit symmetry: AtB with symmetric A (and ABt with symmetric B) reduce to AB;
     record the substitution so the numeric phase can validate it */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create hipsparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJHIPSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
  PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
  PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
  /* device-side alpha/beta constants used with HIPSPARSE_POINTER_MODE_DEVICE */
  PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
  PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr        = (CsrMatrix *)Bmat->mat;
    BmatSpDescr = Bmat->matDescr;
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices; /* shares column/value storage with the compressed CSR; only row offsets differ */
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
    if (Bcsr->num_rows && Bcsr->num_cols) {
      PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
    }
    BmatSpDescr = mmdata->matSpBDescr;
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else flops = 0.; /* TODO */

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts with nnz = 0 and NULL col/val pointers; they are set after the nnz is known */
  PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
  PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
  {
    /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
    PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));

    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
    PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
    PetscCallHIP(hipFree(dBuffer1));
    PetscCallHIP(hipFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
      PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));

      PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
      PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
      PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
      PetscCallHIP(hipFree(dBuffer3));
      /* compute the values here as well; the numeric phase can then reuse the analysis */
      PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
    }
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
  PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
  /* ask bufferSize again bytes for external memory */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
  /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
  PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
#endif
#else
  /* pre-5.0 legacy path: Xcsrgemm NNZ count on the host, then a full gemm (no symbolic-only mode) */
  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
  PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                                          Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */

  PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
                                          Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
                                          Ccsr->column_indices->data().get()));
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern into host (Mat_SeqAIJ) storage */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row counts and row-length statistics, as MatAssemblyEnd_SeqAIJ would compute */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2880: /* handles sparse or dense B */
2881: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2882: {
2883: Mat_Product *product = mat->product;
2884: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2886: PetscFunctionBegin;
2887: MatCheckProduct(mat, 1);
2888: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2889: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2890: if (product->type == MATPRODUCT_ABC) {
2891: Ciscusp = PETSC_FALSE;
2892: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2893: }
2894: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2895: PetscBool usecpu = PETSC_FALSE;
2896: switch (product->type) {
2897: case MATPRODUCT_AB:
2898: if (product->api_user) {
2899: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2900: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2901: PetscOptionsEnd();
2902: } else {
2903: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2904: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2905: PetscOptionsEnd();
2906: }
2907: break;
2908: case MATPRODUCT_AtB:
2909: if (product->api_user) {
2910: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2911: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2912: PetscOptionsEnd();
2913: } else {
2914: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2915: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2916: PetscOptionsEnd();
2917: }
2918: break;
2919: case MATPRODUCT_PtAP:
2920: if (product->api_user) {
2921: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2922: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2923: PetscOptionsEnd();
2924: } else {
2925: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2926: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2927: PetscOptionsEnd();
2928: }
2929: break;
2930: case MATPRODUCT_RARt:
2931: if (product->api_user) {
2932: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2933: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2934: PetscOptionsEnd();
2935: } else {
2936: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2937: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2938: PetscOptionsEnd();
2939: }
2940: break;
2941: case MATPRODUCT_ABC:
2942: if (product->api_user) {
2943: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2944: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2945: PetscOptionsEnd();
2946: } else {
2947: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2948: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2949: PetscOptionsEnd();
2950: }
2951: break;
2952: default:
2953: break;
2954: }
2955: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2956: }
2957: /* dispatch */
2958: if (isdense) {
2959: switch (product->type) {
2960: case MATPRODUCT_AB:
2961: case MATPRODUCT_AtB:
2962: case MATPRODUCT_ABt:
2963: case MATPRODUCT_PtAP:
2964: case MATPRODUCT_RARt:
2965: if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2966: else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2967: break;
2968: case MATPRODUCT_ABC:
2969: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2970: break;
2971: default:
2972: break;
2973: }
2974: } else if (Biscusp && Ciscusp) {
2975: switch (product->type) {
2976: case MATPRODUCT_AB:
2977: case MATPRODUCT_AtB:
2978: case MATPRODUCT_ABt:
2979: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2980: break;
2981: case MATPRODUCT_PtAP:
2982: case MATPRODUCT_RARt:
2983: case MATPRODUCT_ABC:
2984: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2985: break;
2986: default:
2987: break;
2988: }
2989: } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2990: PetscFunctionReturn(PETSC_SUCCESS);
2991: }
/* yy = A*xx, computed on the GPU by the shared SpMV kernel (no transpose, no Hermitian, no add) */
2993: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2994: {
2995: PetscFunctionBegin;
2996: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2997: PetscFunctionReturn(PETSC_SUCCESS);
2998: }
/* zz = A*xx + yy, computed on the GPU by the shared SpMV kernel (no transpose, no Hermitian) */
3000: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3001: {
3002: PetscFunctionBegin;
3003: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3004: PetscFunctionReturn(PETSC_SUCCESS);
3005: }
/* yy = A^H * xx (trans = herm = PETSC_TRUE selects the conjugate-transpose operation) */
3007: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3008: {
3009: PetscFunctionBegin;
3010: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3011: PetscFunctionReturn(PETSC_SUCCESS);
3012: }
/* zz = A^H * xx + yy (trans = herm = PETSC_TRUE selects the conjugate-transpose operation) */
3014: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3015: {
3016: PetscFunctionBegin;
3017: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3018: PetscFunctionReturn(PETSC_SUCCESS);
3019: }
/* yy = A^T * xx (trans = PETSC_TRUE, herm = PETSC_FALSE selects the plain transpose) */
3021: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3022: {
3023: PetscFunctionBegin;
3024: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3025: PetscFunctionReturn(PETSC_SUCCESS);
3026: }
3028: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3029: {
3030: int i = blockIdx.x * blockDim.x + threadIdx.x;
3031: if (i < n) y[idx[i]] += x[i];
3032: }
3034: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared worker for all MatMult* variants. xx is read-only input, yy may be NULL (no add),
   and zz receives the result; yy == zz is handled specially (in-place add). Requires the
   matrix data to be current on the GPU (copied below if the CPU modified it). */
3035: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3036: {
3037: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3038: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3039: Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3040: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3041: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3042: PetscBool compressed;
/* nx, ny are only assigned (and only used) on the MAT_HIPSPARSE_CSR paths below */
3043: PetscInt nx, ny;
3045: PetscFunctionBegin;
3046: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
/* empty matrix: the product is zero, so z is just a copy of y (or zeroed) */
3047: if (!a->nz) {
3048: if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3049: else PetscCall(VecSeq_HIP::Set(zz, 0));
3050: PetscFunctionReturn(PETSC_SUCCESS);
3051: }
3052: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3053: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3054: if (!trans) {
3055: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3056: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3057: } else {
/* for the Hermitian case, or when no explicit transpose is requested, let hipSPARSE apply
   op(A) on the untransposed data; otherwise build/use the explicitly stored transpose */
3058: if (herm || !A->form_explicit_transpose) {
3059: opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3060: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3061: } else {
3062: if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3063: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3064: }
3065: }
3066: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3067: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3068: try {
3069: PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3070: if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3071: else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3073: PetscCall(PetscLogGpuTimeBegin());
3074: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3075: /* z = A x + beta y.
3076: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3077: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3078: */
3079: xptr = xarray;
3080: dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3081: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3082: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3083: allocated to accommodate different uses. So we get the length info directly from mat.
3084: */
3085: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3086: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3087: nx = mat->num_cols;
3088: ny = mat->num_rows;
3089: }
3090: } else {
3091: /* z = A^T x + beta y
3092: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3093: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3094: */
3095: xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3096: dptr = zarray;
3097: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3098: if (compressed) { /* Scatter x to work vector */
3099: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3100: thrust::for_each(
3101: #if PetscDefined(HAVE_THRUST_ASYNC)
3102: thrust::hip::par.on(PetscDefaultHipStream),
3103: #endif
3104: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3105: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3106: }
3107: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3108: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3109: nx = mat->num_rows;
3110: ny = mat->num_cols;
3111: }
3112: }
3113: /* csr_spmv does y = alpha op(A) x + beta y */
3114: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3115: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3116: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
/* generic SpMV API: dense-vector descriptors and the work buffer are created once per
   operation (indexed by opA) and reused; only the data pointers are refreshed afterwards */
3117: if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3118: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3119: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3120: PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3121: &matstruct->hipSpMV[opA].spmvBufferSize));
3122: PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3123: matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3124: } else {
3125: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3126: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3127: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3128: }
3129: PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3130: matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3131: #else
3132: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3133: PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3134: #endif
3135: } else {
/* non-CSR storage (HYB) path */
3136: if (hipsparsestruct->nrows) {
3137: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3138: PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3139: }
3140: }
3141: PetscCall(PetscLogGpuTimeEnd());
3143: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3144: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3145: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3146: PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */
3147: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3148: PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3149: }
3150: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3151: PetscCall(VecSeq_HIP::Set(zz, 0));
3152: }
3154: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3155: if (compressed) {
3156: PetscCall(PetscLogGpuTimeBegin());
3157: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3158: and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3159: prevent that. So I just add a ScatterAdd kernel.
3160: */
3161: #if 0
3162: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3163: thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3164: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3165: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3166: VecHIPPlusEquals());
3167: #else
/* one thread per compressed row; 256-thread blocks */
3168: PetscInt n = matstruct->cprowIndices->size();
3169: hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3170: #endif
3171: PetscCall(PetscLogGpuTimeEnd());
3172: }
3173: } else {
3174: if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3175: }
3176: PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3177: if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3178: else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
/* NOTE(review): this catches only a thrown char*; thrust exceptions derived from
   std::exception would propagate past it — confirm this matches the intended convention */
3179: } catch (char *ex) {
3180: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3181: }
3182: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3183: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3184: PetscFunctionReturn(PETSC_SUCCESS);
3185: }
/* zz = A^T * xx + yy (trans = PETSC_TRUE, herm = PETSC_FALSE selects the plain transpose) */
3187: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3188: {
3189: PetscFunctionBegin;
3190: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3191: PetscFunctionReturn(PETSC_SUCCESS);
3192: }
/* Assembly itself is done by the host SeqAIJ implementation; the GPU copy is refreshed
   lazily (e.g. by MatSeqAIJHIPSPARSECopyToGPU() at the next GPU operation) */
3194: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3195: {
3196: PetscFunctionBegin;
3197: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3198: PetscFunctionReturn(PETSC_SUCCESS);
3199: }
3201: /*@
3202: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3203: This matrix will ultimately be pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3205: Collective
3207: Input Parameters:
3208: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3209: . m - number of rows
3210: . n - number of columns
3211: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3212: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3214: Output Parameter:
3215: . A - the matrix
3217: Level: intermediate
3219: Notes:
3220: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3221: `MatXXXXSetPreallocation()` paradigm instead of this routine directly.
3222: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3224: The AIJ format (compressed row storage), is fully compatible with standard Fortran
3225: storage. That is, the stored row and column indices can begin at
3226: either one (as in Fortran) or zero.
3228: Specify the preallocated storage with either `nz` or `nnz` (not both).
3229: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3230: allocation.
3232: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3233: @*/
3234: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3235: {
3236: PetscFunctionBegin;
3237: PetscCall(MatCreate(comm, A));
/* sequential matrix: local and global dimensions coincide */
3238: PetscCall(MatSetSizes(*A, m, n, m, n));
3239: PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
/* preallocate via the SeqAIJ implementation directly; the cast drops const on nnz,
   which the callee does not modify — TODO confirm */
3240: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3241: PetscFunctionReturn(PETSC_SUCCESS);
3242: }
3244: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3245: {
3246: PetscFunctionBegin;
3247: if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3248: else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3249: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3250: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3251: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3252: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3253: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3254: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3255: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3256: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3257: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3258: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3259: PetscCall(MatDestroy_SeqAIJ(A));
3260: PetscFunctionReturn(PETSC_SUCCESS);
3261: }
/* Duplicate via the host SeqAIJ implementation, then convert the copy in place back to MATSEQAIJHIPSPARSE */
3263: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3264: {
3265: PetscFunctionBegin;
3266: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B))
3267: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3268: PetscFunctionReturn(PETSC_SUCCESS);
3269: }
/* Y = Y + a*X on the GPU. Three paths: hipBLAS axpy over the value arrays when the nonzero
   patterns match, hipSPARSE csrgeam when X's pattern is a subset of Y's, and the host
   SeqAIJ fallback otherwise (or when the two matrices use different implementations). */
3271: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3272: {
3273: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3274: Mat_SeqAIJHIPSPARSE *cy;
3275: Mat_SeqAIJHIPSPARSE *cx;
3276: PetscScalar *ay;
3277: const PetscScalar *ax;
3278: CsrMatrix *csry, *csrx;
3280: PetscFunctionBegin;
3281: cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3282: cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
/* mixed implementations (e.g. one matrix bound to CPU): fall back to the host path */
3283: if (X->ops->axpy != Y->ops->axpy) {
3284: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3285: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3286: PetscFunctionReturn(PETSC_SUCCESS);
3287: }
3288: /* if we are here, it means both matrices are bound to GPU */
3289: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3290: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3291: PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3292: PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3293: csry = (CsrMatrix *)cy->mat->mat;
3294: csrx = (CsrMatrix *)cx->mat->mat;
3295: /* see if we can turn this into a hipblas axpy */
/* compare row offsets and column indices on the device; equal structure upgrades str */
3296: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3297: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3298: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3299: if (eq) str = SAME_NONZERO_PATTERN;
3300: }
3301: /* spgeam is buggy with one column */
3302: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3303: if (str == SUBSET_NONZERO_PATTERN) {
/* Y = a*X + 1.0*Y via csrgeam, written in place into Y's arrays; alpha/beta are read
   from the host, hence the temporary pointer-mode switch below */
3304: PetscScalar b = 1.0;
3305: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3306: size_t bufferSize;
3307: void *buffer;
3308: #endif
3310: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3311: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3312: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3313: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3314: PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3315: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3316: PetscCallHIP(hipMalloc(&buffer, bufferSize));
3317: PetscCall(PetscLogGpuTimeBegin());
3318: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3319: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3320: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3321: PetscCall(PetscLogGpuTimeEnd());
3322: PetscCallHIP(hipFree(buffer));
3323: #else
3324: PetscCall(PetscLogGpuTimeBegin());
3325: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3326: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3327: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3328: PetscCall(PetscLogGpuTimeEnd());
3329: #endif
/* restore the default device pointer mode */
3330: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3331: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3332: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3333: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3334: } else if (str == SAME_NONZERO_PATTERN) {
/* identical structure: a single hipBLAS axpy over the nz-long value arrays */
3335: hipblasHandle_t hipblasv2handle;
3336: PetscBLASInt one = 1, bnz = 1;
3338: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3339: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3340: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3341: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3342: PetscCall(PetscLogGpuTimeBegin());
3343: PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3344: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3345: PetscCall(PetscLogGpuTimeEnd());
3346: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3347: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3348: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3349: } else {
/* different pattern: host fallback (result structure may change, so drop the cached transpose) */
3350: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3351: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3352: }
3353: PetscFunctionReturn(PETSC_SUCCESS);
3354: }
3356: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3357: {
3358: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3359: PetscScalar *ay;
3360: hipblasHandle_t hipblasv2handle;
3361: PetscBLASInt one = 1, bnz = 1;
3363: PetscFunctionBegin;
3364: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3365: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3366: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3367: PetscCall(PetscLogGpuTimeBegin());
3368: PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3369: PetscCall(PetscLogGpuFlops(bnz));
3370: PetscCall(PetscLogGpuTimeEnd());
3371: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3372: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3373: PetscFunctionReturn(PETSC_SUCCESS);
3374: }
3376: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3377: {
3378: PetscBool both = PETSC_FALSE;
3379: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3381: PetscFunctionBegin;
3382: if (A->factortype == MAT_FACTOR_NONE) {
3383: Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3384: if (spptr->mat) {
3385: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3386: if (matrix->values) {
3387: both = PETSC_TRUE;
3388: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3389: }
3390: }
3391: if (spptr->matTranspose) {
3392: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3393: if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3394: }
3395: }
3396: //PetscCall(MatZeroEntries_SeqAIJ(A));
3397: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3398: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3399: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3400: else A->offloadmask = PETSC_OFFLOAD_CPU;
3401: PetscFunctionReturn(PETSC_SUCCESS);
3402: }
/* Bind the matrix to CPU (flg = PETSC_TRUE) or GPU (flg = PETSC_FALSE) by swapping the
   operation tables and composed functions between the SeqAIJ and HIPSPARSE variants.
   Factored matrices only record the flag. */
3404: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3405: {
3406: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3408: PetscFunctionBegin;
3409: if (A->factortype != MAT_FACTOR_NONE) {
3410: A->boundtocpu = flg;
3411: PetscFunctionReturn(PETSC_SUCCESS);
3412: }
3413: if (flg) {
/* bring the host copy up to date before handing all operations back to the CPU */
3414: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
3416: A->ops->scale = MatScale_SeqAIJ;
3417: A->ops->axpy = MatAXPY_SeqAIJ;
3418: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3419: A->ops->mult = MatMult_SeqAIJ;
3420: A->ops->multadd = MatMultAdd_SeqAIJ;
3421: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3422: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3423: A->ops->multhermitiantranspose = NULL;
3424: A->ops->multhermitiantransposeadd = NULL;
3425: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
/* wipe the SeqAIJ sub-operations (array accessors etc.) so the plain host defaults are used */
3426: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3427: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3428: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3429: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3430: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3431: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3432: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3433: } else {
/* install the GPU implementations and the device-aware array accessors */
3434: A->ops->scale = MatScale_SeqAIJHIPSPARSE;
3435: A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
3436: A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
3437: A->ops->mult = MatMult_SeqAIJHIPSPARSE;
3438: A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
3439: A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
3440: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3441: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3442: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3443: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3444: a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3445: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3446: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3447: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3448: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3449: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3450: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3451: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3452: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3453: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3454: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3455: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3456: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3457: }
3458: A->boundtocpu = flg;
/* inode optimizations only apply to the CPU code paths */
3459: if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
3460: else a->inode.use = PETSC_FALSE;
3461: PetscFunctionReturn(PETSC_SUCCESS);
3462: }
/* Convert a MATSEQAIJ matrix into a MATSEQAIJHIPSPARSE matrix.

   For MAT_INITIAL_MATRIX a new matrix is duplicated from A; for MAT_REUSE_MATRIX
   the values of A are copied into the existing *newmat (same nonzero pattern
   assumed); otherwise (MAT_INPLACE_MATRIX) A itself is retargeted (B == A).
   The routine then installs the HIPSPARSE payload in B->spptr (unless reusing),
   overrides the relevant function-table entries, and composes the COO and
   product methods. The device copy is marked unallocated; data is uploaded
   lazily on first use. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;
  /* vectors created from this matrix should default to the HIP vector type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: attach the Mat_SeqAIJHIPSPARSE payload
         with its own hipSPARSE handle bound to PETSc's default HIP stream */
      Mat_SeqAIJHIPSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      spptr->format = MAT_HIPSPARSE_CSR;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
      spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
#else
      spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;

      B->spptr = spptr;
    } else {
      /* factored matrix: the payload holds triangular-factor data instead */
      Mat_SeqAIJHIPSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
      PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
  B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
  B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
  B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
  B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;

  /* install the GPU implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Type constructor for MATSEQAIJHIPSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the HIPSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3529: /*MC
3530: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
   A matrix type whose data resides on AMD GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format.
3534: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
3536: Options Database Keys:
3537: + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3538: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3539: Other options include ell (ellpack) or hyb (hybrid).
3540: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3541: - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3543: Level: beginner
3545: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3546: M*/
/* Register the HIPSPARSE solver package as a factorization provider for
   MATSEQAIJHIPSPARSE matrices. All four factor types (LU, Cholesky, ILU, ICC)
   route to the same factory, MatGetFactor_seqaijhipsparse_hipsparse(). */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Free all HIPSPARSE-side resources attached to mat->spptr: the device matrix
   structures (primary storage and optional explicit transpose), the thrust
   work/index arrays, and the hipSPARSE handle, then the payload struct itself.
   No-op when spptr is NULL. */
static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* deleting a NULL pointer is safe, so these need no guards */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3576: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3577: {
3578: PetscFunctionBegin;
3579: if (*mat) {
3580: delete (*mat)->values;
3581: delete (*mat)->column_indices;
3582: delete (*mat)->row_offsets;
3583: delete *mat;
3584: *mat = 0;
3585: }
3586: PetscFunctionReturn(PETSC_SUCCESS);
3587: }
/* Destroy one triangular-factor structure: the hipSPARSE matrix descriptor and
   csrsv solve info, the CSR storage, the device solve/csr2csc buffers, and the
   host-side AA_h buffer (allocated with hipHostMalloc, hence hipHostFree).
   No-op when *trifactor is NULL; the struct is freed and zeroed via PetscFree(). */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
    if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Destroy a multiply structure. The interpretation of (*matstruct)->mat depends
   on the storage format: a hipsparseHybMat_t for ELL/HYB, a CsrMatrix otherwise.
   Also frees the descriptor, the compressed-row index array, the device-resident
   scalar constants (alpha_one/beta_zero/beta_one), and any initialized SpMV
   state (work buffer plus dense-vector descriptors). *matstruct is NULLed. */
static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
        /* mat points at hipSPARSE's opaque hybrid storage in these formats */
        hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
        PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants live in device memory (see MatSeqAIJHIPSPARSEMergeMats) */
    if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));

    Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->hipSpMV[i].initialized) {
        PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
        PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
      }
    }
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Release the data held inside a Mat_SeqAIJHIPSPARSETriFactors structure
   without freeing the structure (or its hipSPARSE handle), so it can be
   refilled by a later factorization. No-op when *trifactors is NULL. */
PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* legacy triangular-factor structures, including their transposes */
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
    /* SpSV-based path: raw device arrays and hipSPARSE descriptors.
       NOTE(review): these pointers are not reset to NULL after being freed —
       presumably safe because they are reassigned before reuse; confirm that
       Reset cannot be called twice in a row on the same structure. */
    PetscCallHIP(hipFree(fs->csrRowPtr));
    PetscCallHIP(hipFree(fs->csrColIdx));
    PetscCallHIP(hipFree(fs->csrVal));
    PetscCallHIP(hipFree(fs->X));
    PetscCallHIP(hipFree(fs->Y));
    // PetscCallHIP(hipFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallHIP(hipFree(fs->spsvBuffer_L));
    PetscCallHIP(hipFree(fs->spsvBuffer_U));
    PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
    PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
    PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
    if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
    if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
    if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3687: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3688: {
3689: hipsparseHandle_t handle;
3691: PetscFunctionBegin;
3692: if (*trifactors) {
3693: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3694: if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3695: PetscCall(PetscFree(*trifactors));
3696: }
3697: PetscFunctionReturn(PETSC_SUCCESS);
3698: }
3700: struct IJCompare {
3701: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3702: {
3703: if (t1.get<0>() < t2.get<0>()) return true;
3704: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3705: return false;
3706: }
3707: };
/* Mark any cached explicit transpose of A as out of date. With destroy ==
   PETSC_TRUE the device transpose storage and the cached csr2csc permutation
   are freed as well. No-op when the matrix has no HIPSPARSE payload. */
static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Container destructor for the device-side COO struct attached by
   MatSetPreallocationCOO_SeqAIJHIPSPARSE(): frees the device perm/jmap arrays
   and then the host struct that holds the (shallow-copied) metadata. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallHIP(hipFree(coo->perm));
  PetscCallHIP(hipFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Set up COO assembly for the GPU type: run the host SeqAIJ COO preallocation
   on (coo_i, coo_j) — staging them to the host first if the caller passed
   device pointers — then mirror the resulting COO-to-CSR maps (jmap, perm) to
   device memory and attach them to the matrix in a
   "__PETSc_MatCOOStruct_Device" container for MatSetValuesCOO_SeqAIJHIPSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool dev_ij = PETSC_FALSE;
  PetscMemType mtype = PETSC_MEMTYPE_HOST;
  PetscInt *i, *j;
  PetscContainer container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host implementation needs host-resident indices */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
    PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* the preallocation above ran on the host */
  // Create the GPU memory
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
  PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3776: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3777: {
3778: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
3779: const PetscCount grid_size = gridDim.x * blockDim.x;
3780: for (; i < nnz; i += grid_size) {
3781: PetscScalar sum = 0.0;
3782: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3783: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3784: }
3785: }
/* Insert/add COO-ordered values v[] into the device CSR storage using the
   jmap/perm maps built by MatSetPreallocationCOO_SeqAIJHIPSPARSE(). v may be
   host- or device-resident; host input is staged through a temporary device
   buffer. */
static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  PetscCount Annz = seq->nz;
  PetscMemType memtype;
  const PetscScalar *v1 = v;
  PetscScalar *Aa;
  PetscContainer container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites, so the current values need not be copied to the device */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* 256-thread blocks; the kernel's strided loop covers all Annz CSR slots */
    hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallHIP(hipPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1)); /* free the staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3827: /*@C
3828: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3830: Not Collective
3832: Input Parameters:
3833: + A - the matrix
3834: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3836: Output Parameters:
3837: + i - the CSR row pointers
3838: - j - the CSR column indices
3840: Level: developer
3842: Note:
3843: When compressed is true, the CSR structure does not contain empty rows
3845: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3846: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  CsrMatrix *csr;

  PetscFunctionBegin;
  /* NOTE(review): this early return requires BOTH outputs to be requested, so
     the later if (i) / if (j) guards can never be false here */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build and cache the uncompressed row offsets on the device from the host a->i */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3875: /*@C
3876: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3878: Not Collective
3880: Input Parameters:
3881: + A - the matrix
3882: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3883: . i - the CSR row pointers
3884: - j - the CSR column indices
3886: Level: developer
3888: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3889: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int *i[], const int *j[])
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* the device arrays remain owned by the matrix; just drop the caller's views */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3900: /*@C
3901: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3903: Not Collective
3905: Input Parameter:
3906: . A - a `MATSEQAIJHIPSPARSE` matrix
3908: Output Parameter:
3909: . a - pointer to the device data
3911: Level: developer
3913: Note:
3914: May trigger host-device copies if the up-to-date matrix data is on host
3916: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3917: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* read access: make the device copy current first */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
3936: /*@C
3937: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3939: Not Collective
3941: Input Parameters:
3942: + A - a `MATSEQAIJHIPSPARSE` matrix
3943: - a - pointer to the device data
3945: Level: developer
3947: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3948: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  *a = NULL; /* read-only access: nothing to invalidate, just drop the pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
3959: /*@C
3960: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3962: Not Collective
3964: Input Parameter:
3965: . A - a `MATSEQAIJHIPSPARSE` matrix
3967: Output Parameter:
3968: . a - pointer to the device data
3970: Level: developer
3972: Note:
3973: May trigger host-device copies if up-to-date matrix data is on host
3975: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3976: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A)); /* read-write: current values must be on the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* caller may modify values: the device copy becomes authoritative and any
     cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3996: /*@C
3997: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3999: Not Collective
4001: Input Parameters:
4002: + A - a `MATSEQAIJHIPSPARSE` matrix
4003: - a - pointer to the device data
4005: Level: developer
4007: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4008: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* values may have changed: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4021: /*@C
4022: MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4024: Not Collective
4026: Input Parameter:
4027: . A - a `MATSEQAIJHIPSPARSE` matrix
4029: Output Parameter:
4030: . a - pointer to the device data
4032: Level: developer
4034: Note:
4035: Does not trigger host-device copies and flags data validity on the GPU
4037: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4038: @*/
PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
{
  Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
  CsrMatrix *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: unlike GetArray(), no MatSeqAIJHIPSPARSECopyToGPU() here — the
     contents will be overwritten, so the device storage must already exist */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
  *a = csr->values->data().get();
  /* device copy becomes authoritative; any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4058: /*@C
4059: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4061: Not Collective
4063: Input Parameters:
4064: + A - a `MATSEQAIJHIPSPARSE` matrix
4065: - a - pointer to the device data
4067: Level: developer
4069: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4070: @*/
PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
  /* values were (re)written: drop the cached diagonal and bump the object state */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4083: struct IJCompare4 {
4084: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4085: {
4086: if (t1.get<0>() < t2.get<0>()) return true;
4087: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4088: return false;
4089: }
4090: };
4092: struct Shift {
4093: int _shift;
4095: Shift(int shift) : _shift(shift) { }
4096: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4097: };
4099: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4100: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4101: {
4102: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4103: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4104: Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4105: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4106: PetscInt Annz, Bnnz;
4107: PetscInt i, m, n, zero = 0;
4109: PetscFunctionBegin;
4112: PetscAssertPointer(C, 4);
4113: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4114: PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4115: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4116: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4117: PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4118: PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4119: if (reuse == MAT_INITIAL_MATRIX) {
4120: m = A->rmap->n;
4121: n = A->cmap->n + B->cmap->n;
4122: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4123: PetscCall(MatSetSizes(*C, m, n, m, n));
4124: PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4125: c = (Mat_SeqAIJ *)(*C)->data;
4126: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4127: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
4128: Ccsr = new CsrMatrix;
4129: Cmat->cprowIndices = NULL;
4130: c->compressedrow.use = PETSC_FALSE;
4131: c->compressedrow.nrows = 0;
4132: c->compressedrow.i = NULL;
4133: c->compressedrow.rindex = NULL;
4134: Ccusp->workVector = NULL;
4135: Ccusp->nrows = m;
4136: Ccusp->mat = Cmat;
4137: Ccusp->mat->mat = Ccsr;
4138: Ccsr->num_rows = m;
4139: Ccsr->num_cols = n;
4140: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4141: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4142: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4143: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4144: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4145: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4146: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4147: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4148: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4149: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4150: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4151: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4152: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4154: Acsr = (CsrMatrix *)Acusp->mat->mat;
4155: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4156: Annz = (PetscInt)Acsr->column_indices->size();
4157: Bnnz = (PetscInt)Bcsr->column_indices->size();
4158: c->nz = Annz + Bnnz;
4159: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4160: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4161: Ccsr->values = new THRUSTARRAY(c->nz);
4162: Ccsr->num_entries = c->nz;
4163: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4164: if (c->nz) {
4165: auto Acoo = new THRUSTINTARRAY32(Annz);
4166: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4167: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4168: THRUSTINTARRAY32 *Aroff, *Broff;
4170: if (a->compressedrow.use) { /* need full row offset */
4171: if (!Acusp->rowoffsets_gpu) {
4172: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4173: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4174: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4175: }
4176: Aroff = Acusp->rowoffsets_gpu;
4177: } else Aroff = Acsr->row_offsets;
4178: if (b->compressedrow.use) { /* need full row offset */
4179: if (!Bcusp->rowoffsets_gpu) {
4180: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4181: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4182: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4183: }
4184: Broff = Bcusp->rowoffsets_gpu;
4185: } else Broff = Bcsr->row_offsets;
4186: PetscCall(PetscLogGpuTimeBegin());
4187: PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4188: PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4189: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4190: auto Aperm = thrust::make_constant_iterator(1);
4191: auto Bperm = thrust::make_constant_iterator(0);
4192: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4193: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4194: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4195: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4196: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4197: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4198: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4199: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4200: auto p1 = Ccusp->coords->begin();
4201: auto p2 = Ccusp->coords->begin();
4202: thrust::advance(p2, Annz);
4203: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4204: auto cci = thrust::make_counting_iterator(zero);
4205: auto cce = thrust::make_counting_iterator(c->nz);
4206: #if 0 //Errors on SUMMIT cuda 11.1.0
4207: PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4208: #else
4209: auto pred = thrust::identity<int>();
4210: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4211: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4212: #endif
4213: PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4214: PetscCall(PetscLogGpuTimeEnd());
4215: delete wPerm;
4216: delete Acoo;
4217: delete Bcoo;
4218: delete Ccoo;
4219: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4221: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4222: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4223: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4224: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4225: Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4226: CsrMatrix *CcsrT = new CsrMatrix;
4227: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4228: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4230: (*C)->form_explicit_transpose = PETSC_TRUE;
4231: (*C)->transupdated = PETSC_TRUE;
4232: Ccusp->rowoffsets_gpu = NULL;
4233: CmatT->cprowIndices = NULL;
4234: CmatT->mat = CcsrT;
4235: CcsrT->num_rows = n;
4236: CcsrT->num_cols = m;
4237: CcsrT->num_entries = c->nz;
4238: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4239: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4240: CcsrT->values = new THRUSTARRAY(c->nz);
4242: PetscCall(PetscLogGpuTimeBegin());
4243: auto rT = CcsrT->row_offsets->begin();
4244: if (AT) {
4245: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4246: thrust::advance(rT, -1);
4247: }
4248: if (BT) {
4249: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4250: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4251: thrust::copy(titb, tite, rT);
4252: }
4253: auto cT = CcsrT->column_indices->begin();
4254: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4255: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4256: auto vT = CcsrT->values->begin();
4257: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4258: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4259: PetscCall(PetscLogGpuTimeEnd());
4261: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4262: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4263: PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4264: PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4265: PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4266: PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4267: PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4268: PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4269: PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4271: PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4272: Ccusp->matTranspose = CmatT;
4273: }
4274: }
4276: c->free_a = PETSC_TRUE;
4277: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4278: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4279: c->free_ij = PETSC_TRUE;
4280: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4281: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4282: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4283: ii = *Ccsr->row_offsets;
4284: jj = *Ccsr->column_indices;
4285: PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4286: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4287: } else {
4288: PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4289: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4290: }
4291: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4292: PetscCall(PetscMalloc1(m, &c->ilen));
4293: PetscCall(PetscMalloc1(m, &c->imax));
4294: c->maxnz = c->nz;
4295: c->nonzerorowcnt = 0;
4296: c->rmax = 0;
4297: for (i = 0; i < m; i++) {
4298: const PetscInt nn = c->i[i + 1] - c->i[i];
4299: c->ilen[i] = c->imax[i] = nn;
4300: c->nonzerorowcnt += (PetscInt)!!nn;
4301: c->rmax = PetscMax(c->rmax, nn);
4302: }
4303: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4304: PetscCall(PetscMalloc1(c->nz, &c->a));
4305: (*C)->nonzerostate++;
4306: PetscCall(PetscLayoutSetUp((*C)->rmap));
4307: PetscCall(PetscLayoutSetUp((*C)->cmap));
4308: Ccusp->nonzerostate = (*C)->nonzerostate;
4309: (*C)->preallocated = PETSC_TRUE;
4310: } else {
4311: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4312: c = (Mat_SeqAIJ *)(*C)->data;
4313: if (c->nz) {
4314: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4315: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4316: PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4317: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4318: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4319: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4320: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4321: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4322: Acsr = (CsrMatrix *)Acusp->mat->mat;
4323: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4324: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4325: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4326: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4327: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4328: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4329: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4330: auto pmid = Ccusp->coords->begin();
4331: thrust::advance(pmid, Acsr->num_entries);
4332: PetscCall(PetscLogGpuTimeBegin());
4333: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4334: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4335: thrust::for_each(zibait, zieait, VecHIPEquals());
4336: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4337: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4338: thrust::for_each(zibbit, ziebit, VecHIPEquals());
4339: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4340: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4341: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4342: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4343: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4344: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4345: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4346: auto vT = CcsrT->values->begin();
4347: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4348: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4349: (*C)->transupdated = PETSC_TRUE;
4350: }
4351: PetscCall(PetscLogGpuTimeEnd());
4352: }
4353: }
4354: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4355: (*C)->assembled = PETSC_TRUE;
4356: (*C)->was_assembled = PETSC_FALSE;
4357: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4358: PetscFunctionReturn(PETSC_SUCCESS);
4359: }
4361: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4362: {
4363: bool dmem;
4364: const PetscScalar *av;
4366: PetscFunctionBegin;
4367: dmem = isHipMem(v);
4368: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4369: if (n && idx) {
4370: THRUSTINTARRAY widx(n);
4371: widx.assign(idx, idx + n);
4372: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4374: THRUSTARRAY *w = NULL;
4375: thrust::device_ptr<PetscScalar> dv;
4376: if (dmem) dv = thrust::device_pointer_cast(v);
4377: else {
4378: w = new THRUSTARRAY(n);
4379: dv = w->data();
4380: }
4381: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4383: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4384: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4385: thrust::for_each(zibit, zieit, VecHIPEquals());
4386: if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4387: delete w;
4388: } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4390: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4391: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4392: PetscFunctionReturn(PETSC_SUCCESS);
4393: }