Actual source code: aijhipsparse.hip.cpp
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #include <thrust/iterator/transform_iterator.h>
17: #if PETSC_CPP_VERSION >= 14
18: #define PETSC_HAVE_THRUST_ASYNC 1
19: #include <thrust/async/for_each.h>
20: #endif
21: #include <thrust/iterator/constant_iterator.h>
22: #include <thrust/iterator/discard_iterator.h>
23: #include <thrust/binary_search.h>
24: #include <thrust/remove.h>
25: #include <thrust/sort.h>
26: #include <thrust/unique.h>
/*
  String tables handed to PetscOptionsEnum() by MatSetFromOptions_SeqAIJHIPSPARSE(). The selected enum value
  is the position of the matching name in the table, so the order of the names must not change. Each table
  ends with two extra strings (they look like the enum type name and the common value prefix) and a 0 terminator.
*/
const char *const MatHIPSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatHIPSPARSEStorageFormat",
  "MAT_HIPSPARSE_",
  0,
};

const char *const MatHIPSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT",
  "COOMV_ALG",
  "CSRMV_ALG1",
  "CSRMV_ALG2",
  "SPMV_ALG_DEFAULT",
  "SPMV_COO_ALG1",
  "SPMV_COO_ALG2",
  "SPMV_CSR_ALG1",
  "SPMV_CSR_ALG2",
  "hipsparseSpMVAlg_t",
  "HIPSPARSE_",
  0,
};

const char *const MatHIPSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT",
  "COO_ALG1",
  "COO_ALG2",
  "COO_ALG3",
  "CSR_ALG1",
  "COO_ALG4",
  "CSR_ALG2",
  "hipsparseSpMMAlg_t",
  "HIPSPARSE_SPMM_",
  0,
};
31: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
33: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
34: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
35: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
36: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
37: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
38: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
39: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat, Vec, Vec);
40: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
41: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
42: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat, Vec, Vec);
43: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
44: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
45: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
46: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
47: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
48: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
49: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
50: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
51: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
52: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
53: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
54: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **);
55: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
56: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
57: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
58: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
59: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat);
60: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
61: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
62: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat, PetscBool);
63: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
64: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
66: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
69: /*
70: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
71: {
72: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
74: PetscFunctionBegin;
75: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
76: hipsparsestruct->stream = stream;
77: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
78: PetscFunctionReturn(PETSC_SUCCESS);
79: }
81: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
82: {
83: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
85: PetscFunctionBegin;
86: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
87: if (hipsparsestruct->handle != handle) {
88: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
89: hipsparsestruct->handle = handle;
90: }
91: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
92: PetscFunctionReturn(PETSC_SUCCESS);
93: }
95: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
96: {
97: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
98: PetscBool flg;
100: PetscFunctionBegin;
101: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
102: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
103: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
106: */
108: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
109: {
110: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
112: PetscFunctionBegin;
113: switch (op) {
114: case MAT_HIPSPARSE_MULT:
115: hipsparsestruct->format = format;
116: break;
117: case MAT_HIPSPARSE_ALL:
118: hipsparsestruct->format = format;
119: break;
120: default:
121: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
122: }
123: PetscFunctionReturn(PETSC_SUCCESS);
124: }
126: /*@
127: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
128: operation. Only the `MatMult()` operation can use different GPU storage formats
130: Not Collective
132: Input Parameters:
133: + A - Matrix of type `MATSEQAIJHIPSPARSE`
134: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
135: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
136: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
138: Level: intermediate
140: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
141: @*/
142: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
143: {
144: PetscFunctionBegin;
146: PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
151: {
152: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
154: PetscFunctionBegin;
155: hipsparsestruct->use_cpu_solve = use_cpu;
156: PetscFunctionReturn(PETSC_SUCCESS);
157: }
159: /*@
160: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
162: Input Parameters:
163: + A - Matrix of type `MATSEQAIJHIPSPARSE`
164: - use_cpu - set flag for using the built-in CPU `MatSolve()`
166: Level: intermediate
168: Notes:
169: The hipSparse LU solver currently computes the factors with the built-in CPU method
170: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
171: This method to specifies if the solve is done on the CPU or GPU (GPU is the default).
173: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
174: @*/
175: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
176: {
177: PetscFunctionBegin;
179: PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
180: PetscFunctionReturn(PETSC_SUCCESS);
181: }
183: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
184: {
185: PetscFunctionBegin;
186: switch (op) {
187: case MAT_FORM_EXPLICIT_TRANSPOSE:
188: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
189: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
190: A->form_explicit_transpose = flg;
191: break;
192: default:
193: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
194: break;
195: }
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
199: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
200: {
201: PetscBool row_identity, col_identity;
202: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
203: IS isrow = b->row, iscol = b->col;
204: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)B->spptr;
206: PetscFunctionBegin;
207: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
208: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
209: B->offloadmask = PETSC_OFFLOAD_CPU;
210: /* determine which version of MatSolve needs to be used. */
211: PetscCall(ISIdentity(isrow, &row_identity));
212: PetscCall(ISIdentity(iscol, &col_identity));
213: if (!hipsparsestruct->use_cpu_solve) {
214: if (row_identity && col_identity) {
215: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
216: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
217: } else {
218: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
219: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
220: }
221: }
222: B->ops->matsolve = NULL;
223: B->ops->matsolvetranspose = NULL;
225: /* get the triangular factors */
226: if (!hipsparsestruct->use_cpu_solve) { PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B)); }
227: PetscFunctionReturn(PETSC_SUCCESS);
228: }
230: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
231: {
232: MatHIPSPARSEStorageFormat format;
233: PetscBool flg;
234: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
236: PetscFunctionBegin;
237: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
238: if (A->factortype == MAT_FACTOR_NONE) {
239: PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
240: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
241: PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
242: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
243: PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
244: if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
245: PetscCall(
246: PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
247: /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
248: PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
249: PetscCall(
250: PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
251: PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
252: /*
253: PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
254: PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
255: */
256: }
257: PetscOptionsHeadEnd();
258: PetscFunctionReturn(PETSC_SUCCESS);
259: }
261: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(Mat A)
262: {
263: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
264: PetscInt n = A->rmap->n;
265: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
266: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
267: const PetscInt *ai = a->i, *aj = a->j, *vi;
268: const MatScalar *aa = a->a, *v;
269: PetscInt *AiLo, *AjLo;
270: PetscInt i, nz, nzLower, offset, rowOffset;
272: PetscFunctionBegin;
273: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
274: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
275: try {
276: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
277: nzLower = n + ai[n] - ai[1];
278: if (!loTriFactor) {
279: PetscScalar *AALo;
280: PetscCallHIP(hipHostMalloc((void **)&AALo, nzLower * sizeof(PetscScalar)));
282: /* Allocate Space for the lower triangular matrix */
283: PetscCallHIP(hipHostMalloc((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
284: PetscCallHIP(hipHostMalloc((void **)&AjLo, nzLower * sizeof(PetscInt)));
286: /* Fill the lower triangular matrix */
287: AiLo[0] = (PetscInt)0;
288: AiLo[n] = nzLower;
289: AjLo[0] = (PetscInt)0;
290: AALo[0] = (MatScalar)1.0;
291: v = aa;
292: vi = aj;
293: offset = 1;
294: rowOffset = 1;
295: for (i = 1; i < n; i++) {
296: nz = ai[i + 1] - ai[i];
297: /* additional 1 for the term on the diagonal */
298: AiLo[i] = rowOffset;
299: rowOffset += nz + 1;
301: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
302: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
303: offset += nz;
304: AjLo[offset] = (PetscInt)i;
305: AALo[offset] = (MatScalar)1.0;
306: offset += 1;
307: v += nz;
308: vi += nz;
309: }
311: /* allocate space for the triangular factor information */
312: PetscCall(PetscNew(&loTriFactor));
313: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
314: /* Create the matrix description */
315: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
316: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
317: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
318: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_LOWER));
319: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
321: /* set the operation */
322: loTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
324: /* set the matrix */
325: loTriFactor->csrMat = new CsrMatrix;
326: loTriFactor->csrMat->num_rows = n;
327: loTriFactor->csrMat->num_cols = n;
328: loTriFactor->csrMat->num_entries = nzLower;
329: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
330: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
331: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
333: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
334: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
335: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
337: /* Create the solve analysis information */
338: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
339: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
340: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
341: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
342: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
344: /* perform the solve analysis */
345: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
348: PetscCallHIP(WaitForHIP());
349: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
351: /* assign the pointer */
352: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
353: loTriFactor->AA_h = AALo;
354: PetscCallHIP(hipHostFree(AiLo));
355: PetscCallHIP(hipHostFree(AjLo));
356: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
357: } else { /* update values only */
358: if (!loTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
359: /* Fill the lower triangular matrix */
360: loTriFactor->AA_h[0] = 1.0;
361: v = aa;
362: vi = aj;
363: offset = 1;
364: for (i = 1; i < n; i++) {
365: nz = ai[i + 1] - ai[i];
366: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
367: offset += nz;
368: loTriFactor->AA_h[offset] = 1.0;
369: offset += 1;
370: v += nz;
371: }
372: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
373: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
374: }
375: } catch (char *ex) {
376: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
377: }
378: }
379: PetscFunctionReturn(PETSC_SUCCESS);
380: }
382: static PetscErrorCode MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(Mat A)
383: {
384: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
385: PetscInt n = A->rmap->n;
386: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
387: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
388: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
389: const MatScalar *aa = a->a, *v;
390: PetscInt *AiUp, *AjUp;
391: PetscInt i, nz, nzUpper, offset;
393: PetscFunctionBegin;
394: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
395: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
396: try {
397: /* next, figure out the number of nonzeros in the upper triangular matrix. */
398: nzUpper = adiag[0] - adiag[n];
399: if (!upTriFactor) {
400: PetscScalar *AAUp;
401: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
403: /* Allocate Space for the upper triangular matrix */
404: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
405: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
407: /* Fill the upper triangular matrix */
408: AiUp[0] = (PetscInt)0;
409: AiUp[n] = nzUpper;
410: offset = nzUpper;
411: for (i = n - 1; i >= 0; i--) {
412: v = aa + adiag[i + 1] + 1;
413: vi = aj + adiag[i + 1] + 1;
414: nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
415: offset -= (nz + 1); /* decrement the offset */
417: /* first, set the diagonal elements */
418: AjUp[offset] = (PetscInt)i;
419: AAUp[offset] = (MatScalar)1. / v[nz];
420: AiUp[i] = AiUp[i + 1] - (nz + 1);
422: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
423: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
424: }
426: /* allocate space for the triangular factor information */
427: PetscCall(PetscNew(&upTriFactor));
428: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
430: /* Create the matrix description */
431: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
432: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
433: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
434: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
435: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
437: /* set the operation */
438: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
440: /* set the matrix */
441: upTriFactor->csrMat = new CsrMatrix;
442: upTriFactor->csrMat->num_rows = n;
443: upTriFactor->csrMat->num_cols = n;
444: upTriFactor->csrMat->num_entries = nzUpper;
445: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
446: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
447: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
448: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
449: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
450: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
452: /* Create the solve analysis information */
453: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
454: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
455: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
456: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
457: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
459: /* perform the solve analysis */
460: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
461: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
463: PetscCallHIP(WaitForHIP());
464: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
466: /* assign the pointer */
467: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
468: upTriFactor->AA_h = AAUp;
469: PetscCallHIP(hipHostFree(AiUp));
470: PetscCallHIP(hipHostFree(AjUp));
471: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
472: } else {
473: if (!upTriFactor->AA_h) PetscCallHIP(hipHostMalloc((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
474: /* Fill the upper triangular matrix */
475: offset = nzUpper;
476: for (i = n - 1; i >= 0; i--) {
477: v = aa + adiag[i + 1] + 1;
478: nz = adiag[i] - adiag[i + 1] - 1; /* number of elements NOT on the diagonal */
479: offset -= (nz + 1); /* decrement the offset */
481: /* first, set the diagonal elements */
482: upTriFactor->AA_h[offset] = 1. / v[nz];
483: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
484: }
485: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
486: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
487: }
488: } catch (char *ex) {
489: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
490: }
491: }
492: PetscFunctionReturn(PETSC_SUCCESS);
493: }
495: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
496: {
497: PetscBool row_identity, col_identity;
498: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
499: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
500: IS isrow = a->row, iscol = a->icol;
501: PetscInt n = A->rmap->n;
503: PetscFunctionBegin;
504: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
505: PetscCall(MatSeqAIJHIPSPARSEBuildILULowerTriMatrix(A));
506: PetscCall(MatSeqAIJHIPSPARSEBuildILUUpperTriMatrix(A));
508: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
509: hipsparseTriFactors->nnz = a->nz;
511: A->offloadmask = PETSC_OFFLOAD_BOTH;
512: /* lower triangular indices */
513: PetscCall(ISIdentity(isrow, &row_identity));
514: if (!row_identity && !hipsparseTriFactors->rpermIndices) {
515: const PetscInt *r;
517: PetscCall(ISGetIndices(isrow, &r));
518: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
519: hipsparseTriFactors->rpermIndices->assign(r, r + n);
520: PetscCall(ISRestoreIndices(isrow, &r));
521: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
522: }
523: /* upper triangular indices */
524: PetscCall(ISIdentity(iscol, &col_identity));
525: if (!col_identity && !hipsparseTriFactors->cpermIndices) {
526: const PetscInt *c;
528: PetscCall(ISGetIndices(iscol, &c));
529: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
530: hipsparseTriFactors->cpermIndices->assign(c, c + n);
531: PetscCall(ISRestoreIndices(iscol, &c));
532: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
533: }
534: PetscFunctionReturn(PETSC_SUCCESS);
535: }
537: static PetscErrorCode MatSeqAIJHIPSPARSEBuildICCTriMatrices(Mat A)
538: {
539: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
540: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
541: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
542: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
543: PetscInt *AiUp, *AjUp;
544: PetscScalar *AAUp;
545: PetscScalar *AALo;
546: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
547: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
548: const PetscInt *ai = b->i, *aj = b->j, *vj;
549: const MatScalar *aa = b->a, *v;
551: PetscFunctionBegin;
552: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
553: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
554: try {
555: PetscCallHIP(hipHostMalloc((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
556: PetscCallHIP(hipHostMalloc((void **)&AALo, nzUpper * sizeof(PetscScalar)));
557: if (!upTriFactor && !loTriFactor) {
558: /* Allocate Space for the upper triangular matrix */
559: PetscCallHIP(hipHostMalloc((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
560: PetscCallHIP(hipHostMalloc((void **)&AjUp, nzUpper * sizeof(PetscInt)));
562: /* Fill the upper triangular matrix */
563: AiUp[0] = (PetscInt)0;
564: AiUp[n] = nzUpper;
565: offset = 0;
566: for (i = 0; i < n; i++) {
567: /* set the pointers */
568: v = aa + ai[i];
569: vj = aj + ai[i];
570: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
572: /* first, set the diagonal elements */
573: AjUp[offset] = (PetscInt)i;
574: AAUp[offset] = (MatScalar)1.0 / v[nz];
575: AiUp[i] = offset;
576: AALo[offset] = (MatScalar)1.0 / v[nz];
578: offset += 1;
579: if (nz > 0) {
580: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
581: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
582: for (j = offset; j < offset + nz; j++) {
583: AAUp[j] = -AAUp[j];
584: AALo[j] = AAUp[j] / v[nz];
585: }
586: offset += nz;
587: }
588: }
590: /* allocate space for the triangular factor information */
591: PetscCall(PetscNew(&upTriFactor));
592: upTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
594: /* Create the matrix description */
595: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactor->descr));
596: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
597: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
598: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
599: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactor->descr, HIPSPARSE_DIAG_TYPE_UNIT));
601: /* set the matrix */
602: upTriFactor->csrMat = new CsrMatrix;
603: upTriFactor->csrMat->num_rows = A->rmap->n;
604: upTriFactor->csrMat->num_cols = A->cmap->n;
605: upTriFactor->csrMat->num_entries = a->nz;
606: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
607: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
608: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
609: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
610: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
611: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
613: /* set the operation */
614: upTriFactor->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
616: /* Create the solve analysis information */
617: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
618: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactor->solveInfo));
619: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
620: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
621: PetscCallHIP(hipMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
623: /* perform the solve analysis */
624: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
625: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
627: PetscCallHIP(WaitForHIP());
628: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
630: /* assign the pointer */
631: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
633: /* allocate space for the triangular factor information */
634: PetscCall(PetscNew(&loTriFactor));
635: loTriFactor->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
637: /* Create the matrix description */
638: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactor->descr));
639: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactor->descr, HIPSPARSE_INDEX_BASE_ZERO));
640: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactor->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
641: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactor->descr, HIPSPARSE_FILL_MODE_UPPER));
642: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactor->descr, HIPSPARSE_DIAG_TYPE_NON_UNIT));
644: /* set the operation */
645: loTriFactor->solveOp = HIPSPARSE_OPERATION_TRANSPOSE;
647: /* set the matrix */
648: loTriFactor->csrMat = new CsrMatrix;
649: loTriFactor->csrMat->num_rows = A->rmap->n;
650: loTriFactor->csrMat->num_cols = A->cmap->n;
651: loTriFactor->csrMat->num_entries = a->nz;
652: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
653: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
654: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
655: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
656: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
657: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
659: /* Create the solve analysis information */
660: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
661: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactor->solveInfo));
662: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
663: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
664: PetscCallHIP(hipMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
666: /* perform the solve analysis */
667: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
668: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
670: PetscCallHIP(WaitForHIP());
671: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
673: /* assign the pointer */
674: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
676: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
677: PetscCallHIP(hipHostFree(AiUp));
678: PetscCallHIP(hipHostFree(AjUp));
679: } else {
680: /* Fill the upper triangular matrix */
681: offset = 0;
682: for (i = 0; i < n; i++) {
683: /* set the pointers */
684: v = aa + ai[i];
685: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
687: /* first, set the diagonal elements */
688: AAUp[offset] = 1.0 / v[nz];
689: AALo[offset] = 1.0 / v[nz];
691: offset += 1;
692: if (nz > 0) {
693: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
694: for (j = offset; j < offset + nz; j++) {
695: AAUp[j] = -AAUp[j];
696: AALo[j] = AAUp[j] / v[nz];
697: }
698: offset += nz;
699: }
700: }
701: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
702: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
703: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
704: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
705: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
706: }
707: PetscCallHIP(hipHostFree(AAUp));
708: PetscCallHIP(hipHostFree(AALo));
709: } catch (char *ex) {
710: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
711: }
712: }
713: PetscFunctionReturn(PETSC_SUCCESS);
714: }
716: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
717: {
718: PetscBool perm_identity;
719: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
720: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
721: IS ip = a->row;
722: PetscInt n = A->rmap->n;
724: PetscFunctionBegin;
725: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
726: PetscCall(MatSeqAIJHIPSPARSEBuildICCTriMatrices(A));
727: if (!hipsparseTriFactors->workVector) hipsparseTriFactors->workVector = new THRUSTARRAY(n);
728: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
730: A->offloadmask = PETSC_OFFLOAD_BOTH;
731: /* lower triangular indices */
732: PetscCall(ISIdentity(ip, &perm_identity));
733: if (!perm_identity) {
734: IS iip;
735: const PetscInt *irip, *rip;
737: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
738: PetscCall(ISGetIndices(iip, &irip));
739: PetscCall(ISGetIndices(ip, &rip));
740: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
741: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
742: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
743: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
744: PetscCall(ISRestoreIndices(iip, &irip));
745: PetscCall(ISDestroy(&iip));
746: PetscCall(ISRestoreIndices(ip, &rip));
747: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
748: }
749: PetscFunctionReturn(PETSC_SUCCESS);
750: }
752: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
753: {
754: PetscBool perm_identity;
755: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
756: IS ip = b->row;
758: PetscFunctionBegin;
759: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
760: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
761: B->offloadmask = PETSC_OFFLOAD_CPU;
762: /* determine which version of MatSolve needs to be used. */
763: PetscCall(ISIdentity(ip, &perm_identity));
764: if (perm_identity) {
765: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_NaturalOrdering;
766: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering;
767: B->ops->matsolve = NULL;
768: B->ops->matsolvetranspose = NULL;
769: } else {
770: B->ops->solve = MatSolve_SeqAIJHIPSPARSE;
771: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE;
772: B->ops->matsolve = NULL;
773: B->ops->matsolvetranspose = NULL;
774: }
776: /* get the triangular factors */
777: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
778: PetscFunctionReturn(PETSC_SUCCESS);
779: }
781: static PetscErrorCode MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(Mat A)
782: {
783: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
784: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
785: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
786: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT;
787: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT;
788: hipsparseIndexBase_t indexBase;
789: hipsparseMatrixType_t matrixType;
790: hipsparseFillMode_t fillMode;
791: hipsparseDiagType_t diagType;
793: PetscFunctionBegin;
794: /* allocate space for the transpose of the lower triangular factor */
795: PetscCall(PetscNew(&loTriFactorT));
796: loTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
798: /* set the matrix descriptors of the lower triangular factor */
799: matrixType = hipsparseGetMatType(loTriFactor->descr);
800: indexBase = hipsparseGetMatIndexBase(loTriFactor->descr);
801: fillMode = hipsparseGetMatFillMode(loTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
802: diagType = hipsparseGetMatDiagType(loTriFactor->descr);
804: /* Create the matrix description */
805: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&loTriFactorT->descr));
806: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(loTriFactorT->descr, indexBase));
807: PetscCallHIPSPARSE(hipsparseSetMatType(loTriFactorT->descr, matrixType));
808: PetscCallHIPSPARSE(hipsparseSetMatFillMode(loTriFactorT->descr, fillMode));
809: PetscCallHIPSPARSE(hipsparseSetMatDiagType(loTriFactorT->descr, diagType));
811: /* set the operation */
812: loTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
814: /* allocate GPU space for the CSC of the lower triangular factor*/
815: loTriFactorT->csrMat = new CsrMatrix;
816: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
817: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
818: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
819: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
820: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
821: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
823: /* compute the transpose of the lower triangular factor, i.e. the CSC */
824: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
825: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
826: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
827: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
828: loTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
829: PetscCallHIP(hipMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
830: #endif
831: */
832: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
834: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
835: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
836: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
837: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
838: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
839: #else
840: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
841: #endif
843: PetscCallHIP(WaitForHIP());
844: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
846: /* Create the solve analysis information */
847: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
848: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
849: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
850: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
851: PetscCallHIP(hipMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
853: /* perform the solve analysis */
854: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
855: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
857: PetscCallHIP(WaitForHIP());
858: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
860: /* assign the pointer */
861: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
863: /*********************************************/
864: /* Now the Transpose of the Upper Tri Factor */
865: /*********************************************/
867: /* allocate space for the transpose of the upper triangular factor */
868: PetscCall(PetscNew(&upTriFactorT));
869: upTriFactorT->solvePolicy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
871: /* set the matrix descriptors of the upper triangular factor */
872: matrixType = hipsparseGetMatType(upTriFactor->descr);
873: indexBase = hipsparseGetMatIndexBase(upTriFactor->descr);
874: fillMode = hipsparseGetMatFillMode(upTriFactor->descr) == HIPSPARSE_FILL_MODE_UPPER ? HIPSPARSE_FILL_MODE_LOWER : HIPSPARSE_FILL_MODE_UPPER;
875: diagType = hipsparseGetMatDiagType(upTriFactor->descr);
877: /* Create the matrix description */
878: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&upTriFactorT->descr));
879: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(upTriFactorT->descr, indexBase));
880: PetscCallHIPSPARSE(hipsparseSetMatType(upTriFactorT->descr, matrixType));
881: PetscCallHIPSPARSE(hipsparseSetMatFillMode(upTriFactorT->descr, fillMode));
882: PetscCallHIPSPARSE(hipsparseSetMatDiagType(upTriFactorT->descr, diagType));
884: /* set the operation */
885: upTriFactorT->solveOp = HIPSPARSE_OPERATION_NON_TRANSPOSE;
887: /* allocate GPU space for the CSC of the upper triangular factor*/
888: upTriFactorT->csrMat = new CsrMatrix;
889: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
890: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
891: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
892: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
893: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
894: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
896: /* compute the transpose of the upper triangular factor, i.e. the CSC */
897: /* Csr2cscEx2 is not implemented in ROCm-5.2.0 and is planned for implementation in hipsparse with future releases of ROCm
898: #if PETSC_PKG_HIP_VERSION_GE(5, 2, 0)
899: PetscCallHIPSPARSE(hipsparseCsr2cscEx2_bufferSize(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
900: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
901: upTriFactorT->csrMat->column_indices->data().get(), hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
902: PetscCallHIP(hipMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
903: #endif
904: */
905: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
906: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
907: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
908: #if 0 /* when Csr2cscEx2 is implemented in hipSparse PETSC_PKG_HIP_VERSION_GE(5, 2, 0)*/
909: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
910: hipsparse_scalartype, HIPSPARSE_ACTION_NUMERIC, indexBase, HIPSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
911: #else
912: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
913: #endif
915: PetscCallHIP(WaitForHIP());
916: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
918: /* Create the solve analysis information */
919: PetscCall(PetscLogEventBegin(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
920: PetscCallHIPSPARSE(hipsparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
921: PetscCallHIPSPARSE(hipsparseXcsrsv_buffsize(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
922: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
923: PetscCallHIP(hipMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
925: /* perform the solve analysis */
926: PetscCallHIPSPARSE(hipsparseXcsrsv_analysis(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
927: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
929: PetscCallHIP(WaitForHIP());
930: PetscCall(PetscLogEventEnd(MAT_HIPSPARSESolveAnalysis, A, 0, 0, 0));
932: /* assign the pointer */
933: ((Mat_SeqAIJHIPSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
934: PetscFunctionReturn(PETSC_SUCCESS);
935: }
937: struct PetscScalarToPetscInt {
938: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
939: };
941: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
942: {
943: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
944: Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
945: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
946: hipsparseIndexBase_t indexBase;
948: PetscFunctionBegin;
949: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
950: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
951: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
952: matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
953: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
954: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
955: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
956: PetscCall(PetscLogGpuTimeBegin());
957: if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
958: if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
959: matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
960: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
961: indexBase = hipsparseGetMatIndexBase(matstruct->descr);
962: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
963: PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
965: /* set alpha and beta */
966: PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
967: PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
968: PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
969: PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
970: PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
971: PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
973: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
974: CsrMatrix *matrixT = new CsrMatrix;
975: matstructT->mat = matrixT;
976: matrixT->num_rows = A->cmap->n;
977: matrixT->num_cols = A->rmap->n;
978: matrixT->num_entries = a->nz;
979: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
980: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
981: matrixT->values = new THRUSTARRAY(a->nz);
983: if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
984: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
986: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
987: indexBase, hipsparse_scalartype));
988: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
989: CsrMatrix *temp = new CsrMatrix;
990: CsrMatrix *tempT = new CsrMatrix;
991: /* First convert HYB to CSR */
992: temp->num_rows = A->rmap->n;
993: temp->num_cols = A->cmap->n;
994: temp->num_entries = a->nz;
995: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
996: temp->column_indices = new THRUSTINTARRAY32(a->nz);
997: temp->values = new THRUSTARRAY(a->nz);
999: PetscCallHIPSPARSE(hipsparse_hyb2csr(hipsparsestruct->handle, matstruct->descr, (hipsparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()));
1001: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1002: tempT->num_rows = A->rmap->n;
1003: tempT->num_cols = A->cmap->n;
1004: tempT->num_entries = a->nz;
1005: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1006: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1007: tempT->values = new THRUSTARRAY(a->nz);
1009: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1010: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1012: /* Last, convert CSC to HYB */
1013: hipsparseHybMat_t hybMat;
1014: PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
1015: hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
1016: PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition));
1018: /* assign the pointer */
1019: matstructT->mat = hybMat;
1020: A->transupdated = PETSC_TRUE;
1021: /* delete temporaries */
1022: if (tempT) {
1023: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1024: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1025: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1026: delete (CsrMatrix *)tempT;
1027: }
1028: if (temp) {
1029: if (temp->values) delete (THRUSTARRAY *)temp->values;
1030: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1031: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1032: delete (CsrMatrix *)temp;
1033: }
1034: }
1035: }
1036: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1037: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1038: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1039: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1040: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1041: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1042: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1043: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1044: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1045: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1046: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1047: if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1048: hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1049: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1050: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1051: }
1052: if (!hipsparsestruct->csr2csc_i) {
1053: THRUSTARRAY csr2csc_a(matrix->num_entries);
1054: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1056: indexBase = hipsparseGetMatIndexBase(matstruct->descr);
1057: if (matrix->num_entries) {
1058: /* This routine is known to give errors with CUDA-11, but works fine with CUDA-10
1059: Need to verify this for ROCm.
1060: */
1061: PetscCallHIPSPARSE(hipsparse_csr2csc(hipsparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), hipsparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1062: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), HIPSPARSE_ACTION_NUMERIC, indexBase));
1063: } else {
1064: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1065: }
1067: hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1068: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), hipsparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1069: }
1070: PetscCallThrust(
1071: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), hipsparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1072: }
1073: PetscCall(PetscLogGpuTimeEnd());
1074: PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
1075: /* the compressed row indices is not used for matTranspose */
1076: matstructT->cprowIndices = NULL;
1077: /* assign the pointer */
1078: ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
1079: A->transupdated = PETSC_TRUE;
1080: PetscFunctionReturn(PETSC_SUCCESS);
1081: }
1083: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = HIPSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJHIPSPARSE? */
1084: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1085: {
1086: PetscInt n = xx->map->n;
1087: const PetscScalar *barray;
1088: PetscScalar *xarray;
1089: thrust::device_ptr<const PetscScalar> bGPU;
1090: thrust::device_ptr<PetscScalar> xGPU;
1091: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1092: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1093: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1094: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1096: PetscFunctionBegin;
1097: /* Analyze the matrix and create the transpose ... on the fly */
1098: if (!loTriFactorT && !upTriFactorT) {
1099: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1100: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1101: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1102: }
1104: /* Get the GPU pointers */
1105: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1106: PetscCall(VecHIPGetArrayRead(bb, &barray));
1107: xGPU = thrust::device_pointer_cast(xarray);
1108: bGPU = thrust::device_pointer_cast(barray);
1110: PetscCall(PetscLogGpuTimeBegin());
1111: /* First, reorder with the row permutation */
1112: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, hipsparseTriFactors->rpermIndices->end()), xGPU);
1114: /* First, solve U */
1115: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1116: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1118: /* Then, solve L */
1119: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1120: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1122: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1123: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(xGPU, hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, hipsparseTriFactors->cpermIndices->end()), tempGPU->begin());
1125: /* Copy the temporary to the full solution. */
1126: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), tempGPU->begin(), tempGPU->end(), xGPU);
1128: /* restore */
1129: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1130: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1131: PetscCall(PetscLogGpuTimeEnd());
1132: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1133: PetscFunctionReturn(PETSC_SUCCESS);
1134: }
1136: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1137: {
1138: const PetscScalar *barray;
1139: PetscScalar *xarray;
1140: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1141: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1142: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1143: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1145: PetscFunctionBegin;
1146: /* Analyze the matrix and create the transpose ... on the fly */
1147: if (!loTriFactorT && !upTriFactorT) {
1148: PetscCall(MatSeqAIJHIPSPARSEAnalyzeTransposeForSolve(A));
1149: loTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtrTranspose;
1150: upTriFactorT = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtrTranspose;
1151: }
1153: /* Get the GPU pointers */
1154: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1155: PetscCall(VecHIPGetArrayRead(bb, &barray));
1157: PetscCall(PetscLogGpuTimeBegin());
1158: /* First, solve U */
1159: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1160: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1162: /* Then, solve L */
1163: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1164: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1166: /* restore */
1167: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1168: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1169: PetscCall(PetscLogGpuTimeEnd());
1170: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1171: PetscFunctionReturn(PETSC_SUCCESS);
1172: }
1174: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE(Mat A, Vec bb, Vec xx)
1175: {
1176: const PetscScalar *barray;
1177: PetscScalar *xarray;
1178: thrust::device_ptr<const PetscScalar> bGPU;
1179: thrust::device_ptr<PetscScalar> xGPU;
1180: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1181: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1182: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1183: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1185: PetscFunctionBegin;
1186: /* Get the GPU pointers */
1187: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1188: PetscCall(VecHIPGetArrayRead(bb, &barray));
1189: xGPU = thrust::device_pointer_cast(xarray);
1190: bGPU = thrust::device_pointer_cast(barray);
1192: PetscCall(PetscLogGpuTimeBegin());
1193: /* First, reorder with the row permutation */
1194: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, hipsparseTriFactors->rpermIndices->end()), tempGPU->begin());
1196: /* Next, solve L */
1197: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1198: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1200: /* Then, solve U */
1201: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1202: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1204: /* Last, reorder with the column permutation */
1205: thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), hipsparseTriFactors->cpermIndices->end()), xGPU);
1207: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1208: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1209: PetscCall(PetscLogGpuTimeEnd());
1210: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1211: PetscFunctionReturn(PETSC_SUCCESS);
1212: }
1214: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1215: {
1216: const PetscScalar *barray;
1217: PetscScalar *xarray;
1218: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1219: Mat_SeqAIJHIPSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->loTriFactorPtr;
1220: Mat_SeqAIJHIPSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJHIPSPARSETriFactorStruct *)hipsparseTriFactors->upTriFactorPtr;
1221: THRUSTARRAY *tempGPU = (THRUSTARRAY *)hipsparseTriFactors->workVector;
1223: PetscFunctionBegin;
1224: /* Get the GPU pointers */
1225: PetscCall(VecHIPGetArrayWrite(xx, &xarray));
1226: PetscCall(VecHIPGetArrayRead(bb, &barray));
1228: PetscCall(PetscLogGpuTimeBegin());
1229: /* First, solve L */
1230: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1231: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1233: /* Next, solve U */
1234: PetscCallHIPSPARSE(hipsparseXcsrsv_solve(hipsparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_HIPSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1235: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1237: PetscCall(VecHIPRestoreArrayRead(bb, &barray));
1238: PetscCall(VecHIPRestoreArrayWrite(xx, &xarray));
1239: PetscCall(PetscLogGpuTimeEnd());
1240: PetscCall(PetscLogGpuFlops(2.0 * hipsparseTriFactors->nnz - A->cmap->n));
1241: PetscFunctionReturn(PETSC_SUCCESS);
1242: }
1244: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1245: /* hipsparseSpSV_solve() and related functions first appeared in ROCm-4.5.0*/
1246: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1247: {
1248: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1249: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1250: const PetscScalar *barray;
1251: PetscScalar *xarray;
1253: PetscFunctionBegin;
1254: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1255: PetscCall(VecHIPGetArrayRead(b, &barray));
1256: PetscCall(PetscLogGpuTimeBegin());
1258: /* Solve L*y = b */
1259: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1260: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1261: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1262: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1263: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1264: #else
1265: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1266: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); // hipsparseSpSV_solve() secretely uses the external buffer used in hipsparseSpSV_analysis()!
1267: #endif
1268: /* Solve U*x = y */
1269: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1270: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1271: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1272: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1273: #else
1274: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1275: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1276: #endif
1277: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1278: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1280: PetscCall(PetscLogGpuTimeEnd());
1281: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1282: PetscFunctionReturn(PETSC_SUCCESS);
1283: }
1285: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_ILU0(Mat fact, Vec b, Vec x)
1286: {
1287: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1288: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1289: const PetscScalar *barray;
1290: PetscScalar *xarray;
1292: PetscFunctionBegin;
1293: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1294: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1295: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1296: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1298: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
1299: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1300: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1301: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1302: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1303: }
1305: if (!fs->updatedTransposeSpSVAnalysis) {
1306: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1308: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1309: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1310: }
1312: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1313: PetscCall(VecHIPGetArrayRead(b, &barray));
1314: PetscCall(PetscLogGpuTimeBegin());
1316: /* Solve Ut*y = b */
1317: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1318: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1319: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1320: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1321: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1322: #else
1323: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1324: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1325: #endif
1326: /* Solve Lt*x = y */
1327: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1328: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1329: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1330: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1331: #else
1332: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1333: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1334: #endif
1335: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1336: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1337: PetscCall(PetscLogGpuTimeEnd());
1338: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1339: PetscFunctionReturn(PETSC_SUCCESS);
1340: }
1342: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1343: {
1344: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1345: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1346: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1347: CsrMatrix *Acsr;
1348: PetscInt m, nz;
1349: PetscBool flg;
1351: PetscFunctionBegin;
1352: if (PetscDefined(USE_DEBUG)) {
1353: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1354: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1355: }
1357: /* Copy A's value to fact */
1358: m = fact->rmap->n;
1359: nz = aij->nz;
1360: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1361: Acsr = (CsrMatrix *)Acusp->mat->mat;
1362: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1364: /* Factorize fact inplace */
1365: if (m)
1366: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1367: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1368: if (PetscDefined(USE_DEBUG)) {
1369: int numerical_zero;
1370: hipsparseStatus_t status;
1371: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1372: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1373: }
1375: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02() */
1376: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1378: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1380: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
1381: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1383: fact->offloadmask = PETSC_OFFLOAD_GPU;
1384: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ILU0;
1385: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_ILU0;
1386: fact->ops->matsolve = NULL;
1387: fact->ops->matsolvetranspose = NULL;
1388: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1389: PetscFunctionReturn(PETSC_SUCCESS);
1390: }
1392: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1393: {
1394: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1395: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1396: PetscInt m, nz;
1398: PetscFunctionBegin;
1399: if (PetscDefined(USE_DEBUG)) {
1400: PetscInt i;
1401: PetscBool flg, missing;
1403: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1404: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1405: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1406: PetscCall(MatMissingDiagonal(A, &missing, &i));
1407: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1408: }
1410: /* Free the old stale stuff */
1411: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1413: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1414: but they will not be used. Allocate them just for easy debugging.
1415: */
1416: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1418: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1419: fact->factortype = MAT_FACTOR_ILU;
1420: fact->info.factor_mallocs = 0;
1421: fact->info.fill_ratio_given = info->fill;
1422: fact->info.fill_ratio_needed = 1.0;
1424: aij->row = NULL;
1425: aij->col = NULL;
1427: /* ====================================================================== */
1428: /* Copy A's i, j to fact and also allocate the value array of fact. */
1429: /* We'll do in-place factorization on fact */
1430: /* ====================================================================== */
1431: const int *Ai, *Aj;
1433: m = fact->rmap->n;
1434: nz = aij->nz;
1436: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1437: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1438: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1439: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1440: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1441: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1443: /* ====================================================================== */
1444: /* Create descriptors for M, L, U */
1445: /* ====================================================================== */
1446: hipsparseFillMode_t fillMode;
1447: hipsparseDiagType_t diagType;
1449: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1450: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1451: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1453: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1454: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1455: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1456: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1457: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1458: */
1459: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1460: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
1461: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1462: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1463: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1465: fillMode = HIPSPARSE_FILL_MODE_UPPER;
1466: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1467: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1468: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1469: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1471: /* ========================================================================= */
1472: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1473: /* ========================================================================= */
1474: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
1475: if (m)
1476: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1477: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1479: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1480: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1482: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1483: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1485: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1486: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1488: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
1489: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1491: /* It appears spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
1492: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1493: */
1494: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1495: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1496: fs->spsvBuffer_L = fs->factBuffer_M;
1497: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1498: } else {
1499: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1500: fs->spsvBuffer_U = fs->factBuffer_M;
1501: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1502: }
1504: /* ========================================================================== */
1505: /* Perform analysis of ilu0 on M, SpSv on L and U */
1506: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1507: /* ========================================================================== */
1508: int structural_zero;
1510: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1511: if (m)
1512: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
1513: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1514: if (PetscDefined(USE_DEBUG)) {
1515: /* Function hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1516: hipsparseStatus_t status;
1517: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1518: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1519: }
1521: /* Estimate FLOPs of the numeric factorization */
1522: {
1523: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1524: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1525: PetscLogDouble flops = 0.0;
1527: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1528: Ai = Aseq->i;
1529: Adiag = Aseq->diag;
1530: for (PetscInt i = 0; i < m; i++) {
1531: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1532: nzRow = Ai[i + 1] - Ai[i];
1533: nzLeft = Adiag[i] - Ai[i];
1534: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1535: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1536: */
1537: nzLeft = (nzRow - 1) / 2;
1538: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1539: }
1540: }
1541: fs->numericFactFlops = flops;
1542: }
1543: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1544: PetscFunctionReturn(PETSC_SUCCESS);
1545: }
1547: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1548: {
1549: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1550: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1551: const PetscScalar *barray;
1552: PetscScalar *xarray;
1554: PetscFunctionBegin;
1555: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1556: PetscCall(VecHIPGetArrayRead(b, &barray));
1557: PetscCall(PetscLogGpuTimeBegin());
1559: /* Solve L*y = b */
1560: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1561: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1562: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1563: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1564: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1565: #else
1566: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1567: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1568: #endif
1569: /* Solve Lt*x = y */
1570: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1571: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0)
1572: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1573: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1574: #else
1575: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1576: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1577: #endif
1578: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1579: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1581: PetscCall(PetscLogGpuTimeEnd());
1582: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1583: PetscFunctionReturn(PETSC_SUCCESS);
1584: }
1586: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1587: {
1588: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1589: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1590: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1591: CsrMatrix *Acsr;
1592: PetscInt m, nz;
1593: PetscBool flg;
1595: PetscFunctionBegin;
1596: if (PetscDefined(USE_DEBUG)) {
1597: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1598: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1599: }
1601: /* Copy A's value to fact */
1602: m = fact->rmap->n;
1603: nz = aij->nz;
1604: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1605: Acsr = (CsrMatrix *)Acusp->mat->mat;
1606: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1608: /* Factorize fact inplace */
1609: /* Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1610: The matrix type must be HIPSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1611: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1612: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1613: */
1614: if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1615: if (PetscDefined(USE_DEBUG)) {
1616: int numerical_zero;
1617: hipsparseStatus_t status;
1618: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1619: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1620: }
1622: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1624: /* Note that hipsparse reports this error if we use double and HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1625: ** On entry to hipsparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1626: */
1627: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1629: fact->offloadmask = PETSC_OFFLOAD_GPU;
1630: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
1631: fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0;
1632: fact->ops->matsolve = NULL;
1633: fact->ops->matsolvetranspose = NULL;
1634: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1635: PetscFunctionReturn(PETSC_SUCCESS);
1636: }
1638: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1639: {
1640: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1641: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1642: PetscInt m, nz;
1644: PetscFunctionBegin;
1645: if (PetscDefined(USE_DEBUG)) {
1646: PetscInt i;
1647: PetscBool flg, missing;
1649: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1650: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1651: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1652: PetscCall(MatMissingDiagonal(A, &missing, &i));
1653: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1654: }
1656: /* Free the old stale stuff */
1657: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1659: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1660: but they will not be used. Allocate them just for easy debugging.
1661: */
1662: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1664: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1665: fact->factortype = MAT_FACTOR_ICC;
1666: fact->info.factor_mallocs = 0;
1667: fact->info.fill_ratio_given = info->fill;
1668: fact->info.fill_ratio_needed = 1.0;
1670: aij->row = NULL;
1671: aij->col = NULL;
1673: /* ====================================================================== */
1674: /* Copy A's i, j to fact and also allocate the value array of fact. */
1675: /* We'll do in-place factorization on fact */
1676: /* ====================================================================== */
1677: const int *Ai, *Aj;
1679: m = fact->rmap->n;
1680: nz = aij->nz;
1682: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1683: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1684: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1685: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1686: PetscCallHIP(hipMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1687: PetscCallHIP(hipMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1689: /* ====================================================================== */
1690: /* Create mat descriptors for M, L */
1691: /* ====================================================================== */
1692: hipsparseFillMode_t fillMode;
1693: hipsparseDiagType_t diagType;
1695: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1696: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1697: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1699: /* https://docs.amd.com/bundle/hipSPARSE-Documentation---hipSPARSE-documentation/page/usermanual.html/#hipsparse_8h_1a79e036b6c0680cb37e2aa53d3542a054
1700: hipsparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1701: assumed to be present, but if HIPSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1702: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1703: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1704: */
1705: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1706: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1707: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1708: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1709: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1711: /* ========================================================================= */
1712: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1713: /* ========================================================================= */
1714: PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1715: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1717: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1718: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1720: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1721: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1723: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1724: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1726: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1727: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1729: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1730: See also comments in `MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0()`.
1731: */
1732: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1733: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1734: fs->spsvBuffer_L = fs->factBuffer_M;
1735: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1736: } else {
1737: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1738: fs->spsvBuffer_Lt = fs->factBuffer_M;
1739: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1740: }
1742: /* ========================================================================== */
1743: /* Perform analysis of ic0 on M */
1744: /* The lower triangular part of M has the same sparsity pattern as L */
1745: /* ========================================================================== */
1746: int structural_zero;
1748: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1749: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1750: if (PetscDefined(USE_DEBUG)) {
1751: hipsparseStatus_t status;
1752: /* Function hipsparseXcsric02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
1753: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1754: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1755: }
1757: /* Estimate FLOPs of the numeric factorization */
1758: {
1759: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1760: PetscInt *Ai, nzRow, nzLeft;
1761: PetscLogDouble flops = 0.0;
1763: Ai = Aseq->i;
1764: for (PetscInt i = 0; i < m; i++) {
1765: nzRow = Ai[i + 1] - Ai[i];
1766: if (nzRow > 1) {
1767: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1768: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1769: */
1770: nzLeft = (nzRow - 1) / 2;
1771: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1772: }
1773: }
1774: fs->numericFactFlops = flops;
1775: }
1776: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1777: PetscFunctionReturn(PETSC_SUCCESS);
1778: }
1779: #endif
1781: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1782: {
1783: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1785: PetscFunctionBegin;
1786: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1787: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1788: if (hipsparseTriFactors->factorizeOnDevice) {
1789: PetscCall(ISIdentity(isrow, &row_identity));
1790: PetscCall(ISIdentity(iscol, &col_identity));
1791: }
1792: if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1793: else
1794: #endif
1795: {
1796: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1797: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1798: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1799: }
1800: PetscFunctionReturn(PETSC_SUCCESS);
1801: }
1803: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1804: {
1805: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1807: PetscFunctionBegin;
1808: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1809: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1810: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1811: PetscFunctionReturn(PETSC_SUCCESS);
1812: }
1814: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1815: {
1816: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1818: PetscFunctionBegin;
1819: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1820: PetscBool perm_identity = PETSC_FALSE;
1821: if (hipsparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1822: if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1823: else
1824: #endif
1825: {
1826: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1827: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1828: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1829: }
1830: PetscFunctionReturn(PETSC_SUCCESS);
1831: }
1833: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1834: {
1835: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1837: PetscFunctionBegin;
1838: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1839: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1840: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1841: PetscFunctionReturn(PETSC_SUCCESS);
1842: }
1844: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1845: {
1846: PetscFunctionBegin;
1847: *type = MATSOLVERHIPSPARSE;
1848: PetscFunctionReturn(PETSC_SUCCESS);
1849: }
1851: /*MC
1852: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1853: of type `MATSEQAIJHIPSPARSE` on a single GPU. Currently supported
1854: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1855: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1856: HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1857: algorithms are not recommended. This class does NOT support direct solver operations.
1859: Level: beginner
1861: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1862: M*/
1864: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1865: {
1866: PetscInt n = A->rmap->n;
1867: PetscBool factOnDevice, factOnHost;
1868: char *prefix;
1869: char factPlace[32] = "device"; /* the default */
1871: PetscFunctionBegin;
1872: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1873: PetscCall(MatSetSizes(*B, n, n, n, n));
1874: (*B)->factortype = ftype;
1875: PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1877: prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1878: PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
1879: PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1880: PetscOptionsEnd();
1881: PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1882: PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1883: PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1884: ((Mat_SeqAIJHIPSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
1886: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1887: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1888: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1889: if (!A->boundtocpu) {
1890: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1891: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1892: } else {
1893: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1894: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1895: }
1896: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1897: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1898: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1899: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1900: if (!A->boundtocpu) {
1901: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1902: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1903: } else {
1904: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1905: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1906: }
1907: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1908: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1909: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1911: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1912: (*B)->canuseordering = PETSC_TRUE;
1913: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1914: PetscFunctionReturn(PETSC_SUCCESS);
1915: }
1917: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1918: {
1919: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1920: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1921: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1922: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1923: #endif
1925: PetscFunctionBegin;
1926: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1927: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1928: if (A->factortype == MAT_FACTOR_NONE) {
1929: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1930: PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1931: }
1932: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
1933: else if (fs->csrVal) {
1934: /* We have a factorized matrix on device and are able to copy it to host */
1935: PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1936: }
1937: #endif
1938: else
1939: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1940: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1941: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1942: A->offloadmask = PETSC_OFFLOAD_BOTH;
1943: }
1944: PetscFunctionReturn(PETSC_SUCCESS);
1945: }
1947: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1948: {
1949: PetscFunctionBegin;
1950: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1951: *array = ((Mat_SeqAIJ *)A->data)->a;
1952: PetscFunctionReturn(PETSC_SUCCESS);
1953: }
1955: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1956: {
1957: PetscFunctionBegin;
1958: A->offloadmask = PETSC_OFFLOAD_CPU;
1959: *array = NULL;
1960: PetscFunctionReturn(PETSC_SUCCESS);
1961: }
1963: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1964: {
1965: PetscFunctionBegin;
1966: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1967: *array = ((Mat_SeqAIJ *)A->data)->a;
1968: PetscFunctionReturn(PETSC_SUCCESS);
1969: }
1971: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1972: {
1973: PetscFunctionBegin;
1974: *array = NULL;
1975: PetscFunctionReturn(PETSC_SUCCESS);
1976: }
1978: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1979: {
1980: PetscFunctionBegin;
1981: *array = ((Mat_SeqAIJ *)A->data)->a;
1982: PetscFunctionReturn(PETSC_SUCCESS);
1983: }
1985: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1986: {
1987: PetscFunctionBegin;
1988: A->offloadmask = PETSC_OFFLOAD_CPU;
1989: *array = NULL;
1990: PetscFunctionReturn(PETSC_SUCCESS);
1991: }
1993: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1994: {
1995: Mat_SeqAIJHIPSPARSE *cusp;
1996: CsrMatrix *matrix;
1998: PetscFunctionBegin;
1999: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2000: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2001: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
2002: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2003: matrix = (CsrMatrix *)cusp->mat->mat;
2005: if (i) {
2006: #if !defined(PETSC_USE_64BIT_INDICES)
2007: *i = matrix->row_offsets->data().get();
2008: #else
2009: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2010: #endif
2011: }
2012: if (j) {
2013: #if !defined(PETSC_USE_64BIT_INDICES)
2014: *j = matrix->column_indices->data().get();
2015: #else
2016: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSparse does not supported 64-bit indices");
2017: #endif
2018: }
2019: if (a) *a = matrix->values->data().get();
2020: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
2021: PetscFunctionReturn(PETSC_SUCCESS);
2022: }
2024: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
2025: {
2026: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2027: Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
2028: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2029: PetscBool both = PETSC_TRUE;
2030: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2032: PetscFunctionBegin;
2033: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2034: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2035: if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
2036: CsrMatrix *matrix;
2037: matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
2039: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2040: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2041: matrix->values->assign(a->a, a->a + a->nz);
2042: PetscCallHIP(WaitForHIP());
2043: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2044: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2045: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
2046: } else {
2047: PetscInt nnz;
2048: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2049: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
2050: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
2051: delete hipsparsestruct->workVector;
2052: delete hipsparsestruct->rowoffsets_gpu;
2053: hipsparsestruct->workVector = NULL;
2054: hipsparsestruct->rowoffsets_gpu = NULL;
2055: try {
2056: if (a->compressedrow.use) {
2057: m = a->compressedrow.nrows;
2058: ii = a->compressedrow.i;
2059: ridx = a->compressedrow.rindex;
2060: } else {
2061: m = A->rmap->n;
2062: ii = a->i;
2063: ridx = NULL;
2064: }
2065: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2066: if (!a->a) {
2067: nnz = ii[m];
2068: both = PETSC_FALSE;
2069: } else nnz = a->nz;
2070: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2072: /* create hipsparse matrix */
2073: hipsparsestruct->nrows = m;
2074: matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
2075: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
2076: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
2077: PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2079: PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2080: PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2081: PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2082: PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2083: PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2084: PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2085: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2087: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2088: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2089: /* set the matrix */
2090: CsrMatrix *mat = new CsrMatrix;
2091: mat->num_rows = m;
2092: mat->num_cols = A->cmap->n;
2093: mat->num_entries = nnz;
2094: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2095: mat->column_indices = new THRUSTINTARRAY32(nnz);
2096: mat->values = new THRUSTARRAY(nnz);
2097: mat->row_offsets->assign(ii, ii + m + 1);
2098: mat->column_indices->assign(a->j, a->j + nnz);
2099: if (a->a) mat->values->assign(a->a, a->a + nnz);
2101: /* assign the pointer */
2102: matstruct->mat = mat;
2103: if (mat->num_rows) { /* hipsparse errors on empty matrices! */
2104: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2105: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2106: }
2107: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
2108: CsrMatrix *mat = new CsrMatrix;
2109: mat->num_rows = m;
2110: mat->num_cols = A->cmap->n;
2111: mat->num_entries = nnz;
2112: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2113: mat->column_indices = new THRUSTINTARRAY32(nnz);
2114: mat->values = new THRUSTARRAY(nnz);
2115: mat->row_offsets->assign(ii, ii + m + 1);
2116: mat->column_indices->assign(a->j, a->j + nnz);
2117: if (a->a) mat->values->assign(a->a, a->a + nnz);
2119: hipsparseHybMat_t hybMat;
2120: PetscCallHIPSPARSE(hipsparseCreateHybMat(&hybMat));
2121: hipsparseHybPartition_t partition = hipsparsestruct->format == MAT_HIPSPARSE_ELL ? HIPSPARSE_HYB_PARTITION_MAX : HIPSPARSE_HYB_PARTITION_AUTO;
2122: PetscCallHIPSPARSE(hipsparse_csr2hyb(hipsparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition));
2123: /* assign the pointer */
2124: matstruct->mat = hybMat;
2126: if (mat) {
2127: if (mat->values) delete (THRUSTARRAY *)mat->values;
2128: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2129: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2130: delete (CsrMatrix *)mat;
2131: }
2132: }
2134: /* assign the compressed row indices */
2135: if (a->compressedrow.use) {
2136: hipsparsestruct->workVector = new THRUSTARRAY(m);
2137: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2138: matstruct->cprowIndices->assign(ridx, ridx + m);
2139: tmp = m;
2140: } else {
2141: hipsparsestruct->workVector = NULL;
2142: matstruct->cprowIndices = NULL;
2143: tmp = 0;
2144: }
2145: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2147: /* assign the pointer */
2148: hipsparsestruct->mat = matstruct;
2149: } catch (char *ex) {
2150: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2151: }
2152: PetscCallHIP(WaitForHIP());
2153: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
2154: hipsparsestruct->nonzerostate = A->nonzerostate;
2155: }
2156: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2157: }
2158: PetscFunctionReturn(PETSC_SUCCESS);
2159: }
2161: struct VecHIPPlusEquals {
2162: template <typename Tuple>
2163: __host__ __device__ void operator()(Tuple t)
2164: {
2165: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2166: }
2167: };
2169: struct VecHIPEquals {
2170: template <typename Tuple>
2171: __host__ __device__ void operator()(Tuple t)
2172: {
2173: thrust::get<1>(t) = thrust::get<0>(t);
2174: }
2175: };
2177: struct VecHIPEqualsReverse {
2178: template <typename Tuple>
2179: __host__ __device__ void operator()(Tuple t)
2180: {
2181: thrust::get<0>(t) = thrust::get<1>(t);
2182: }
2183: };
2185: struct MatMatHipsparse {
2186: PetscBool cisdense;
2187: PetscScalar *Bt;
2188: Mat X;
2189: PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
2190: PetscLogDouble flops;
2191: CsrMatrix *Bcsr;
2192: hipsparseSpMatDescr_t matSpBDescr;
2193: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2194: hipsparseDnMatDescr_t matBDescr;
2195: hipsparseDnMatDescr_t matCDescr;
2196: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2197: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2198: void *dBuffer4, *dBuffer5;
2199: #endif
2200: size_t mmBufferSize;
2201: void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2202: hipsparseSpGEMMDescr_t spgemmDesc;
2203: };
2205: static PetscErrorCode MatDestroy_MatMatHipsparse(void *data)
2206: {
2207: MatMatHipsparse *mmdata = (MatMatHipsparse *)data;
2209: PetscFunctionBegin;
2210: PetscCallHIP(hipFree(mmdata->Bt));
2211: delete mmdata->Bcsr;
2212: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
2213: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2214: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2215: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2216: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2217: if (mmdata->dBuffer4) PetscCallHIP(hipFree(mmdata->dBuffer4));
2218: if (mmdata->dBuffer5) PetscCallHIP(hipFree(mmdata->dBuffer5));
2219: #endif
2220: if (mmdata->mmBuffer) PetscCallHIP(hipFree(mmdata->mmBuffer));
2221: if (mmdata->mmBuffer2) PetscCallHIP(hipFree(mmdata->mmBuffer2));
2222: PetscCall(MatDestroy(&mmdata->X));
2223: PetscCall(PetscFree(data));
2224: PetscFunctionReturn(PETSC_SUCCESS);
2225: }
2227: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2228: {
2229: Mat_Product *product = C->product;
2230: Mat A, B;
2231: PetscInt m, n, blda, clda;
2232: PetscBool flg, biship;
2233: Mat_SeqAIJHIPSPARSE *cusp;
2234: hipsparseOperation_t opA;
2235: const PetscScalar *barray;
2236: PetscScalar *carray;
2237: MatMatHipsparse *mmdata;
2238: Mat_SeqAIJHIPSPARSEMultStruct *mat;
2239: CsrMatrix *csrmat;
2241: PetscFunctionBegin;
2242: MatCheckProduct(C, 1);
2243: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2244: mmdata = (MatMatHipsparse *)product->data;
2245: A = product->A;
2246: B = product->B;
2247: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2248: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2249: /* currently CopyToGpu does not copy if the matrix is bound to CPU
2250: Instead of silently accepting the wrong answer, I prefer to raise the error */
2251: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2252: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2253: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2254: switch (product->type) {
2255: case MATPRODUCT_AB:
2256: case MATPRODUCT_PtAP:
2257: mat = cusp->mat;
2258: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2259: m = A->rmap->n;
2260: n = B->cmap->n;
2261: break;
2262: case MATPRODUCT_AtB:
2263: if (!A->form_explicit_transpose) {
2264: mat = cusp->mat;
2265: opA = HIPSPARSE_OPERATION_TRANSPOSE;
2266: } else {
2267: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2268: mat = cusp->matTranspose;
2269: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2270: }
2271: m = A->cmap->n;
2272: n = B->cmap->n;
2273: break;
2274: case MATPRODUCT_ABt:
2275: case MATPRODUCT_RARt:
2276: mat = cusp->mat;
2277: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2278: m = A->rmap->n;
2279: n = B->rmap->n;
2280: break;
2281: default:
2282: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2283: }
2284: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
2285: csrmat = (CsrMatrix *)mat->mat;
2286: /* if the user passed a CPU matrix, copy the data to the GPU */
2287: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
2288: if (!biship) { PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B)); }
2289: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2290: PetscCall(MatDenseGetLDA(B, &blda));
2291: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2292: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2293: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2294: } else {
2295: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2296: PetscCall(MatDenseGetLDA(C, &clda));
2297: }
2299: PetscCall(PetscLogGpuTimeBegin());
2300: hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
2301: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2302: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2303: size_t mmBufferSize;
2304: if (mmdata->initialized && mmdata->Blda != blda) {
2305: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
2306: mmdata->matBDescr = NULL;
2307: }
2308: if (!mmdata->matBDescr) {
2309: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2310: mmdata->Blda = blda;
2311: }
2312: if (mmdata->initialized && mmdata->Clda != clda) {
2313: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
2314: mmdata->matCDescr = NULL;
2315: }
2316: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2317: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
2318: mmdata->Clda = clda;
2319: }
2320: if (!mat->matDescr) {
2321: PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2322: HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2323: }
2324: PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2325: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2326: PetscCallHIP(hipFree(mmdata->mmBuffer));
2327: PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
2328: mmdata->mmBufferSize = mmBufferSize;
2329: }
2330: mmdata->initialized = PETSC_TRUE;
2331: } else {
2332: /* to be safe, always update pointers of the mats */
2333: PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
2334: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2335: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2336: }
2338: /* do hipsparseSpMM, which supports transpose on B */
2339: PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2341: PetscCall(PetscLogGpuTimeEnd());
2342: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2343: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2344: if (product->type == MATPRODUCT_RARt) {
2345: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2346: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2347: } else if (product->type == MATPRODUCT_PtAP) {
2348: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2349: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2350: } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2351: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2352: if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2353: PetscFunctionReturn(PETSC_SUCCESS);
2354: }
2356: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
2357: {
2358: Mat_Product *product = C->product;
2359: Mat A, B;
2360: PetscInt m, n;
2361: PetscBool cisdense, flg;
2362: MatMatHipsparse *mmdata;
2363: Mat_SeqAIJHIPSPARSE *cusp;
2365: PetscFunctionBegin;
2366: MatCheckProduct(C, 1);
2367: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2368: A = product->A;
2369: B = product->B;
2370: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2371: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2372: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2373: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2374: switch (product->type) {
2375: case MATPRODUCT_AB:
2376: m = A->rmap->n;
2377: n = B->cmap->n;
2378: break;
2379: case MATPRODUCT_AtB:
2380: m = A->cmap->n;
2381: n = B->cmap->n;
2382: break;
2383: case MATPRODUCT_ABt:
2384: m = A->rmap->n;
2385: n = B->rmap->n;
2386: break;
2387: case MATPRODUCT_PtAP:
2388: m = B->cmap->n;
2389: n = B->cmap->n;
2390: break;
2391: case MATPRODUCT_RARt:
2392: m = B->rmap->n;
2393: n = B->rmap->n;
2394: break;
2395: default:
2396: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2397: }
2398: PetscCall(MatSetSizes(C, m, n, m, n));
2399: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2400: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2401: PetscCall(MatSetType(C, MATSEQDENSEHIP));
2403: /* product data */
2404: PetscCall(PetscNew(&mmdata));
2405: mmdata->cisdense = cisdense;
2406: /* for these products we need intermediate storage */
2407: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2408: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2409: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
2410: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
2411: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2412: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2413: }
2414: C->product->data = mmdata;
2415: C->product->destroy = MatDestroy_MatMatHipsparse;
2416: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
2417: PetscFunctionReturn(PETSC_SUCCESS);
2418: }
2420: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2421: {
2422: Mat_Product *product = C->product;
2423: Mat A, B;
2424: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2425: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2426: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2427: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2428: PetscBool flg;
2429: MatProductType ptype;
2430: MatMatHipsparse *mmdata;
2431: hipsparseSpMatDescr_t BmatSpDescr;
2432: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2434: PetscFunctionBegin;
2435: MatCheckProduct(C, 1);
2436: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2437: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
2438: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2439: mmdata = (MatMatHipsparse *)C->product->data;
2440: A = product->A;
2441: B = product->B;
2442: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2443: mmdata->reusesym = PETSC_FALSE;
2444: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2445: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2446: Cmat = Ccusp->mat;
2447: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2448: Ccsr = (CsrMatrix *)Cmat->mat;
2449: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2450: goto finalize;
2451: }
2452: if (!c->nz) goto finalize;
2453: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2454: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2455: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2456: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2457: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2458: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2459: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2460: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2461: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2462: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2463: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2464: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2465: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2466: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2468: ptype = product->type;
2469: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2470: ptype = MATPRODUCT_AB;
2471: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2472: }
2473: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2474: ptype = MATPRODUCT_AB;
2475: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2476: }
2477: switch (ptype) {
2478: case MATPRODUCT_AB:
2479: Amat = Acusp->mat;
2480: Bmat = Bcusp->mat;
2481: break;
2482: case MATPRODUCT_AtB:
2483: Amat = Acusp->matTranspose;
2484: Bmat = Bcusp->mat;
2485: break;
2486: case MATPRODUCT_ABt:
2487: Amat = Acusp->mat;
2488: Bmat = Bcusp->matTranspose;
2489: break;
2490: default:
2491: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2492: }
2493: Cmat = Ccusp->mat;
2494: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2495: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2496: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2497: Acsr = (CsrMatrix *)Amat->mat;
2498: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2499: Ccsr = (CsrMatrix *)Cmat->mat;
2500: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2501: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2502: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2503: PetscCall(PetscLogGpuTimeBegin());
2504: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2505: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2506: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2507: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2508: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2509: #else
2510: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2511: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2512: #endif
2513: #else
2514: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2515: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2516: Ccsr->column_indices->data().get()));
2517: #endif
2518: PetscCall(PetscLogGpuFlops(mmdata->flops));
2519: PetscCallHIP(WaitForHIP());
2520: PetscCall(PetscLogGpuTimeEnd());
2521: C->offloadmask = PETSC_OFFLOAD_GPU;
2522: finalize:
2523: /* shorter version of MatAssemblyEnd_SeqAIJ */
2524: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2525: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2526: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2527: c->reallocs = 0;
2528: C->info.mallocs += 0;
2529: C->info.nz_unneeded = 0;
2530: C->assembled = C->was_assembled = PETSC_TRUE;
2531: C->num_ass++;
2532: PetscFunctionReturn(PETSC_SUCCESS);
2533: }
2535: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
2536: {
2537: Mat_Product *product = C->product;
2538: Mat A, B;
2539: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
2540: Mat_SeqAIJ *a, *b, *c;
2541: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
2542: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2543: PetscInt i, j, m, n, k;
2544: PetscBool flg;
2545: MatProductType ptype;
2546: MatMatHipsparse *mmdata;
2547: PetscLogDouble flops;
2548: PetscBool biscompressed, ciscompressed;
2549: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2550: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2551: hipsparseSpMatDescr_t BmatSpDescr;
2552: #else
2553: int cnz;
2554: #endif
2555: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
2557: PetscFunctionBegin;
2558: MatCheckProduct(C, 1);
2559: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2560: A = product->A;
2561: B = product->B;
2562: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
2563: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2564: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
2565: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2566: a = (Mat_SeqAIJ *)A->data;
2567: b = (Mat_SeqAIJ *)B->data;
2568: /* product data */
2569: PetscCall(PetscNew(&mmdata));
2570: C->product->data = mmdata;
2571: C->product->destroy = MatDestroy_MatMatHipsparse;
2573: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2574: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2575: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2576: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2577: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2578: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2580: ptype = product->type;
2581: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2582: ptype = MATPRODUCT_AB;
2583: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2584: }
2585: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2586: ptype = MATPRODUCT_AB;
2587: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2588: }
2589: biscompressed = PETSC_FALSE;
2590: ciscompressed = PETSC_FALSE;
2591: switch (ptype) {
2592: case MATPRODUCT_AB:
2593: m = A->rmap->n;
2594: n = B->cmap->n;
2595: k = A->cmap->n;
2596: Amat = Acusp->mat;
2597: Bmat = Bcusp->mat;
2598: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2599: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2600: break;
2601: case MATPRODUCT_AtB:
2602: m = A->cmap->n;
2603: n = B->cmap->n;
2604: k = A->rmap->n;
2605: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2606: Amat = Acusp->matTranspose;
2607: Bmat = Bcusp->mat;
2608: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2609: break;
2610: case MATPRODUCT_ABt:
2611: m = A->rmap->n;
2612: n = B->rmap->n;
2613: k = A->cmap->n;
2614: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2615: Amat = Acusp->mat;
2616: Bmat = Bcusp->matTranspose;
2617: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2618: break;
2619: default:
2620: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2621: }
2623: /* create hipsparse matrix */
2624: PetscCall(MatSetSizes(C, m, n, m, n));
2625: PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2626: c = (Mat_SeqAIJ *)C->data;
2627: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2628: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2629: Ccsr = new CsrMatrix;
2631: c->compressedrow.use = ciscompressed;
2632: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2633: c->compressedrow.nrows = a->compressedrow.nrows;
2634: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2635: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2636: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2637: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2638: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2639: } else {
2640: c->compressedrow.nrows = 0;
2641: c->compressedrow.i = NULL;
2642: c->compressedrow.rindex = NULL;
2643: Ccusp->workVector = NULL;
2644: Cmat->cprowIndices = NULL;
2645: }
2646: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2647: Ccusp->mat = Cmat;
2648: Ccusp->mat->mat = Ccsr;
2649: Ccsr->num_rows = Ccusp->nrows;
2650: Ccsr->num_cols = n;
2651: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2652: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2653: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2654: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2655: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2656: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2657: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2658: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2659: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2660: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2661: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2662: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2663: c->nz = 0;
2664: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2665: Ccsr->values = new THRUSTARRAY(c->nz);
2666: goto finalizesym;
2667: }
2669: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2670: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2671: Acsr = (CsrMatrix *)Amat->mat;
2672: if (!biscompressed) {
2673: Bcsr = (CsrMatrix *)Bmat->mat;
2674: BmatSpDescr = Bmat->matDescr;
2675: } else { /* we need to use row offsets for the full matrix */
2676: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2677: Bcsr = new CsrMatrix;
2678: Bcsr->num_rows = B->rmap->n;
2679: Bcsr->num_cols = cBcsr->num_cols;
2680: Bcsr->num_entries = cBcsr->num_entries;
2681: Bcsr->column_indices = cBcsr->column_indices;
2682: Bcsr->values = cBcsr->values;
2683: if (!Bcusp->rowoffsets_gpu) {
2684: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2685: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2686: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2687: }
2688: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2689: mmdata->Bcsr = Bcsr;
2690: if (Bcsr->num_rows && Bcsr->num_cols) {
2691: PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2692: }
2693: BmatSpDescr = mmdata->matSpBDescr;
2694: }
2695: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2696: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2697: /* precompute flops count */
2698: if (ptype == MATPRODUCT_AB) {
2699: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2700: const PetscInt st = a->i[i];
2701: const PetscInt en = a->i[i + 1];
2702: for (j = st; j < en; j++) {
2703: const PetscInt brow = a->j[j];
2704: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2705: }
2706: }
2707: } else if (ptype == MATPRODUCT_AtB) {
2708: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2709: const PetscInt anzi = a->i[i + 1] - a->i[i];
2710: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2711: flops += (2. * anzi) * bnzi;
2712: }
2713: } else flops = 0.; /* TODO */
2715: mmdata->flops = flops;
2716: PetscCall(PetscLogGpuTimeBegin());
2717: #if PETSC_PKG_HIP_VERSION_GE(5, 0, 0)
2718: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2719: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2720: PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2721: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
2722: {
2723: /* hipsparseSpGEMMreuse has more reasonable APIs than hipsparseSpGEMM, so we prefer to use it.
2724: We follow the sample code at https://github.com/ROCmSoftwarePlatform/hipSPARSE/blob/develop/clients/include/testing_spgemmreuse_csr.hpp
2725: */
2726: void *dBuffer1 = NULL;
2727: void *dBuffer2 = NULL;
2728: void *dBuffer3 = NULL;
2729: /* dBuffer4, dBuffer5 are needed by hipsparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2730: size_t bufferSize1 = 0;
2731: size_t bufferSize2 = 0;
2732: size_t bufferSize3 = 0;
2733: size_t bufferSize4 = 0;
2734: size_t bufferSize5 = 0;
2736: /* ask bufferSize1 bytes for external memory */
2737: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL));
2738: PetscCallHIP(hipMalloc((void **)&dBuffer1, bufferSize1));
2739: /* inspect the matrices A and B to understand the memory requirement for the next step */
2740: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1));
2742: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL));
2743: PetscCallHIP(hipMalloc((void **)&dBuffer2, bufferSize2));
2744: PetscCallHIP(hipMalloc((void **)&dBuffer3, bufferSize3));
2745: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2746: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4));
2747: PetscCallHIP(hipFree(dBuffer1));
2748: PetscCallHIP(hipFree(dBuffer2));
2750: /* get matrix C non-zero entries C_nnz1 */
2751: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2752: c->nz = (PetscInt)C_nnz1;
2753: /* allocate matrix C */
2754: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2755: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2756: Ccsr->values = new THRUSTARRAY(c->nz);
2757: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2758: /* update matC with the new pointers */
2759: if (c->nz) { /* 5.5.1 has a bug with nz = 0, exposed by mat_tests_ex123_2_hypre */
2760: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2762: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL));
2763: PetscCallHIP(hipMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2764: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5));
2765: PetscCallHIP(hipFree(dBuffer3));
2766: PetscCallHIPSPARSE(hipsparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2767: }
2768: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2769: }
2770: #else
2771: size_t bufSize2;
2772: /* ask bufferSize bytes for external memory */
2773: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2774: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2775: /* inspect the matrices A and B to understand the memory requirement for the next step */
2776: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2777: /* ask bufferSize again bytes for external memory */
2778: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2779: /* Similar to CUSPARSE, we need both buffers to perform the operations properly!
2780: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2781: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2782: is stored in the descriptor! What a messy API... */
2783: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2784: /* compute the intermediate product of A * B */
2785: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2786: /* get matrix C non-zero entries C_nnz1 */
2787: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2788: c->nz = (PetscInt)C_nnz1;
2789: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2790: mmdata->mmBufferSize / 1024));
2791: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2792: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2793: Ccsr->values = new THRUSTARRAY(c->nz);
2794: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2795: PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2796: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2797: #endif
2798: #else
2799: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_HOST));
2800: PetscCallHIPSPARSE(hipsparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2801: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz));
2802: c->nz = cnz;
2803: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2804: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2805: Ccsr->values = new THRUSTARRAY(c->nz);
2806: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2808: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2809: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2810: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2811: D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2812: PetscCallHIPSPARSE(hipsparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr,
2813: Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(),
2814: Ccsr->column_indices->data().get()));
2815: #endif
2816: PetscCall(PetscLogGpuFlops(mmdata->flops));
2817: PetscCall(PetscLogGpuTimeEnd());
2818: finalizesym:
2819: c->singlemalloc = PETSC_FALSE;
2820: c->free_a = PETSC_TRUE;
2821: c->free_ij = PETSC_TRUE;
2822: PetscCall(PetscMalloc1(m + 1, &c->i));
2823: PetscCall(PetscMalloc1(c->nz, &c->j));
2824: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2825: PetscInt *d_i = c->i;
2826: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2827: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2828: ii = *Ccsr->row_offsets;
2829: jj = *Ccsr->column_indices;
2830: if (ciscompressed) d_i = c->compressedrow.i;
2831: PetscCallHIP(hipMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2832: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2833: } else {
2834: PetscInt *d_i = c->i;
2835: if (ciscompressed) d_i = c->compressedrow.i;
2836: PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2837: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2838: }
2839: if (ciscompressed) { /* need to expand host row offsets */
2840: PetscInt r = 0;
2841: c->i[0] = 0;
2842: for (k = 0; k < c->compressedrow.nrows; k++) {
2843: const PetscInt next = c->compressedrow.rindex[k];
2844: const PetscInt old = c->compressedrow.i[k];
2845: for (; r < next; r++) c->i[r + 1] = old;
2846: }
2847: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2848: }
2849: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2850: PetscCall(PetscMalloc1(m, &c->ilen));
2851: PetscCall(PetscMalloc1(m, &c->imax));
2852: c->maxnz = c->nz;
2853: c->nonzerorowcnt = 0;
2854: c->rmax = 0;
2855: for (k = 0; k < m; k++) {
2856: const PetscInt nn = c->i[k + 1] - c->i[k];
2857: c->ilen[k] = c->imax[k] = nn;
2858: c->nonzerorowcnt += (PetscInt) !!nn;
2859: c->rmax = PetscMax(c->rmax, nn);
2860: }
2861: PetscCall(MatMarkDiagonal_SeqAIJ(C));
2862: PetscCall(PetscMalloc1(c->nz, &c->a));
2863: Ccsr->num_entries = c->nz;
2865: C->nonzerostate++;
2866: PetscCall(PetscLayoutSetUp(C->rmap));
2867: PetscCall(PetscLayoutSetUp(C->cmap));
2868: Ccusp->nonzerostate = C->nonzerostate;
2869: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2870: C->preallocated = PETSC_TRUE;
2871: C->assembled = PETSC_FALSE;
2872: C->was_assembled = PETSC_FALSE;
2873: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2874: mmdata->reusesym = PETSC_TRUE;
2875: C->offloadmask = PETSC_OFFLOAD_GPU;
2876: }
2877: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2878: PetscFunctionReturn(PETSC_SUCCESS);
2879: }
2881: /* handles sparse or dense B */
2882: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2883: {
2884: Mat_Product *product = mat->product;
2885: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2887: PetscFunctionBegin;
2888: MatCheckProduct(mat, 1);
2889: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2890: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2891: if (product->type == MATPRODUCT_ABC) {
2892: Ciscusp = PETSC_FALSE;
2893: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2894: }
2895: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2896: PetscBool usecpu = PETSC_FALSE;
2897: switch (product->type) {
2898: case MATPRODUCT_AB:
2899: if (product->api_user) {
2900: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2901: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2902: PetscOptionsEnd();
2903: } else {
2904: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2905: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2906: PetscOptionsEnd();
2907: }
2908: break;
2909: case MATPRODUCT_AtB:
2910: if (product->api_user) {
2911: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2912: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2913: PetscOptionsEnd();
2914: } else {
2915: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2916: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2917: PetscOptionsEnd();
2918: }
2919: break;
2920: case MATPRODUCT_PtAP:
2921: if (product->api_user) {
2922: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2923: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2924: PetscOptionsEnd();
2925: } else {
2926: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2927: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2928: PetscOptionsEnd();
2929: }
2930: break;
2931: case MATPRODUCT_RARt:
2932: if (product->api_user) {
2933: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2934: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2935: PetscOptionsEnd();
2936: } else {
2937: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2938: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2939: PetscOptionsEnd();
2940: }
2941: break;
2942: case MATPRODUCT_ABC:
2943: if (product->api_user) {
2944: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2945: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2946: PetscOptionsEnd();
2947: } else {
2948: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2949: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2950: PetscOptionsEnd();
2951: }
2952: break;
2953: default:
2954: break;
2955: }
2956: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2957: }
2958: /* dispatch */
2959: if (isdense) {
2960: switch (product->type) {
2961: case MATPRODUCT_AB:
2962: case MATPRODUCT_AtB:
2963: case MATPRODUCT_ABt:
2964: case MATPRODUCT_PtAP:
2965: case MATPRODUCT_RARt:
2966: if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2967: else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2968: break;
2969: case MATPRODUCT_ABC:
2970: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2971: break;
2972: default:
2973: break;
2974: }
2975: } else if (Biscusp && Ciscusp) {
2976: switch (product->type) {
2977: case MATPRODUCT_AB:
2978: case MATPRODUCT_AtB:
2979: case MATPRODUCT_ABt:
2980: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2981: break;
2982: case MATPRODUCT_PtAP:
2983: case MATPRODUCT_RARt:
2984: case MATPRODUCT_ABC:
2985: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2986: break;
2987: default:
2988: break;
2989: }
2990: } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2991: PetscFunctionReturn(PETSC_SUCCESS);
2992: }
2994: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2995: {
2996: PetscFunctionBegin;
2997: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2998: PetscFunctionReturn(PETSC_SUCCESS);
2999: }
3001: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3002: {
3003: PetscFunctionBegin;
3004: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3005: PetscFunctionReturn(PETSC_SUCCESS);
3006: }
3008: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3009: {
3010: PetscFunctionBegin;
3011: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3012: PetscFunctionReturn(PETSC_SUCCESS);
3013: }
3015: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3016: {
3017: PetscFunctionBegin;
3018: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3019: PetscFunctionReturn(PETSC_SUCCESS);
3020: }
3022: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
3023: {
3024: PetscFunctionBegin;
3025: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3026: PetscFunctionReturn(PETSC_SUCCESS);
3027: }
3029: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3030: {
3031: int i = blockIdx.x * blockDim.x + threadIdx.x;
3032: if (i < n) y[idx[i]] += x[i];
3033: }
3035: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3036: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3037: {
3038: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3039: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3040: Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
3041: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3042: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
3043: PetscBool compressed;
3044: PetscInt nx, ny;
3046: PetscFunctionBegin;
3047: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3048: if (!a->nz) {
3049: if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
3050: else PetscCall(VecSeq_HIP::Set(zz, 0));
3051: PetscFunctionReturn(PETSC_SUCCESS);
3052: }
3053: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3054: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3055: if (!trans) {
3056: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3057: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
3058: } else {
3059: if (herm || !A->form_explicit_transpose) {
3060: opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
3061: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
3062: } else {
3063: if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3064: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
3065: }
3066: }
3067: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3068: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3069: try {
3070: PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
3071: if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3072: else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3074: PetscCall(PetscLogGpuTimeBegin());
3075: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3076: /* z = A x + beta y.
3077: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3078: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3079: */
3080: xptr = xarray;
3081: dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
3082: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3083: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3084: allocated to accommodate different uses. So we get the length info directly from mat.
3085: */
3086: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3087: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3088: nx = mat->num_cols;
3089: ny = mat->num_rows;
3090: }
3091: } else {
3092: /* z = A^T x + beta y
3093: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3094: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3095: */
3096: xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
3097: dptr = zarray;
3098: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3099: if (compressed) { /* Scatter x to work vector */
3100: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3101: thrust::for_each(
3102: #if PetscDefined(HAVE_THRUST_ASYNC)
3103: thrust::hip::par.on(PetscDefaultHipStream),
3104: #endif
3105: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3106: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
3107: }
3108: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3109: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3110: nx = mat->num_rows;
3111: ny = mat->num_cols;
3112: }
3113: }
3114: /* csr_spmv does y = alpha op(A) x + beta y */
3115: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
3116: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0)
3117: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
3118: if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
3119: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
3120: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
3121: PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
3122: &matstruct->hipSpMV[opA].spmvBufferSize));
3123: PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
3124: matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
3125: } else {
3126: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3127: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
3128: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
3129: }
3130: PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
3131: matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
3132: #else
3133: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3134: PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3135: #endif
3136: } else {
3137: if (hipsparsestruct->nrows) {
3138: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
3139: PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3140: }
3141: }
3142: PetscCall(PetscLogGpuTimeEnd());
3144: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
3145: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3146: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3147: PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */
3148: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3149: PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3150: }
3151: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3152: PetscCall(VecSeq_HIP::Set(zz, 0));
3153: }
3155: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3156: if (compressed) {
3157: PetscCall(PetscLogGpuTimeBegin());
3158: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3159: and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
3160: prevent that. So I just add a ScatterAdd kernel.
3161: */
3162: #if 0
3163: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3164: thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
3165: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3166: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3167: VecHIPPlusEquals());
3168: #else
3169: PetscInt n = matstruct->cprowIndices->size();
3170: hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
3171: #endif
3172: PetscCall(PetscLogGpuTimeEnd());
3173: }
3174: } else {
3175: if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
3176: }
3177: PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
3178: if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
3179: else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
3180: } catch (char *ex) {
3181: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
3182: }
3183: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3184: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3185: PetscFunctionReturn(PETSC_SUCCESS);
3186: }
3188: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3189: {
3190: PetscFunctionBegin;
3191: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3192: PetscFunctionReturn(PETSC_SUCCESS);
3193: }
3195: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
3196: {
3197: PetscFunctionBegin;
3198: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3199: PetscFunctionReturn(PETSC_SUCCESS);
3200: }
3202: /*@
3203: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
3204: This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
3206: Collective
3208: Input Parameters:
3209: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3210: . m - number of rows
3211: . n - number of columns
3212: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set
3213: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3215: Output Parameter:
3216: . A - the matrix
3218: Level: intermediate
3220: Notes:
3221: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3222: `MatXXXXSetPreallocation()` paradgm instead of this routine directly.
3223: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
3225: The AIJ format (compressed row storage), is fully compatible with standard Fortran
3226: storage. That is, the stored row and column indices can begin at
3227: either one (as in Fortran) or zero.
3229: Specify the preallocated storage with either `nz` or `nnz` (not both).
3230: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3231: allocation.
3233: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
3234: @*/
3235: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3236: {
3237: PetscFunctionBegin;
3238: PetscCall(MatCreate(comm, A));
3239: PetscCall(MatSetSizes(*A, m, n, m, n));
3240: PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
3241: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3242: PetscFunctionReturn(PETSC_SUCCESS);
3243: }
3245: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
3246: {
3247: PetscFunctionBegin;
3248: if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJHIPSPARSE_Destroy(A));
3249: else PetscCall(MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)&A->spptr));
3250: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3251: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
3252: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetUseCPUSolve_C", NULL));
3253: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3254: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3255: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3256: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3257: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3258: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3259: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijhipsparse_hypre_C", NULL));
3260: PetscCall(MatDestroy_SeqAIJ(A));
3261: PetscFunctionReturn(PETSC_SUCCESS);
3262: }
3264: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3265: {
3266: PetscFunctionBegin;
3267: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3268: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(*B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, B));
3269: PetscFunctionReturn(PETSC_SUCCESS);
3270: }
3272: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3273: {
3274: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3275: Mat_SeqAIJHIPSPARSE *cy;
3276: Mat_SeqAIJHIPSPARSE *cx;
3277: PetscScalar *ay;
3278: const PetscScalar *ax;
3279: CsrMatrix *csry, *csrx;
3281: PetscFunctionBegin;
3282: cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
3283: cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
3284: if (X->ops->axpy != Y->ops->axpy) {
3285: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3286: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3287: PetscFunctionReturn(PETSC_SUCCESS);
3288: }
3289: /* if we are here, it means both matrices are bound to GPU */
3290: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
3291: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
3292: PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3293: PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
3294: csry = (CsrMatrix *)cy->mat->mat;
3295: csrx = (CsrMatrix *)cx->mat->mat;
3296: /* see if we can turn this into a hipblas axpy */
3297: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3298: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3299: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3300: if (eq) str = SAME_NONZERO_PATTERN;
3301: }
3302: /* spgeam is buggy with one column */
3303: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3304: if (str == SUBSET_NONZERO_PATTERN) {
3305: PetscScalar b = 1.0;
3306: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3307: size_t bufferSize;
3308: void *buffer;
3309: #endif
3311: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3312: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3313: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
3314: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3315: PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3316: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3317: PetscCallHIP(hipMalloc(&buffer, bufferSize));
3318: PetscCall(PetscLogGpuTimeBegin());
3319: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3320: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3321: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3322: PetscCall(PetscLogGpuTimeEnd());
3323: PetscCallHIP(hipFree(buffer));
3324: #else
3325: PetscCall(PetscLogGpuTimeBegin());
3326: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3327: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3328: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3329: PetscCall(PetscLogGpuTimeEnd());
3330: #endif
3331: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
3332: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3333: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3334: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3335: } else if (str == SAME_NONZERO_PATTERN) {
3336: hipblasHandle_t hipblasv2handle;
3337: PetscBLASInt one = 1, bnz = 1;
3339: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
3340: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3341: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3342: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3343: PetscCall(PetscLogGpuTimeBegin());
3344: PetscCallHIPBLAS(hipblasXaxpy(hipblasv2handle, bnz, &a, ax, one, ay, one));
3345: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3346: PetscCall(PetscLogGpuTimeEnd());
3347: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
3348: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3349: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3350: } else {
3351: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3352: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3353: }
3354: PetscFunctionReturn(PETSC_SUCCESS);
3355: }
3357: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
3358: {
3359: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3360: PetscScalar *ay;
3361: hipblasHandle_t hipblasv2handle;
3362: PetscBLASInt one = 1, bnz = 1;
3364: PetscFunctionBegin;
3365: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
3366: PetscCall(PetscHIPBLASGetHandle(&hipblasv2handle));
3367: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3368: PetscCall(PetscLogGpuTimeBegin());
3369: PetscCallHIPBLAS(hipblasXscal(hipblasv2handle, bnz, &a, ay, one));
3370: PetscCall(PetscLogGpuFlops(bnz));
3371: PetscCall(PetscLogGpuTimeEnd());
3372: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
3373: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3374: PetscFunctionReturn(PETSC_SUCCESS);
3375: }
3377: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
3378: {
3379: PetscBool both = PETSC_FALSE;
3380: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3382: PetscFunctionBegin;
3383: if (A->factortype == MAT_FACTOR_NONE) {
3384: Mat_SeqAIJHIPSPARSE *spptr = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3385: if (spptr->mat) {
3386: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3387: if (matrix->values) {
3388: both = PETSC_TRUE;
3389: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3390: }
3391: }
3392: if (spptr->matTranspose) {
3393: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3394: if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
3395: }
3396: }
3397: //PetscCall(MatZeroEntries_SeqAIJ(A));
3398: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3399: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3400: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3401: else A->offloadmask = PETSC_OFFLOAD_CPU;
3402: PetscFunctionReturn(PETSC_SUCCESS);
3403: }
3405: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
3406: {
3407: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3409: PetscFunctionBegin;
3410: if (A->factortype != MAT_FACTOR_NONE) {
3411: A->boundtocpu = flg;
3412: PetscFunctionReturn(PETSC_SUCCESS);
3413: }
3414: if (flg) {
3415: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
3417: A->ops->scale = MatScale_SeqAIJ;
3418: A->ops->axpy = MatAXPY_SeqAIJ;
3419: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3420: A->ops->mult = MatMult_SeqAIJ;
3421: A->ops->multadd = MatMultAdd_SeqAIJ;
3422: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3423: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3424: A->ops->multhermitiantranspose = NULL;
3425: A->ops->multhermitiantransposeadd = NULL;
3426: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3427: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3428: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3429: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
3430: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
3431: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3432: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3433: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
3434: } else {
3435: A->ops->scale = MatScale_SeqAIJHIPSPARSE;
3436: A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
3437: A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
3438: A->ops->mult = MatMult_SeqAIJHIPSPARSE;
3439: A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
3440: A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
3441: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
3442: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
3443: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
3444: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
3445: a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
3446: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
3447: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
3448: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
3449: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
3450: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
3451: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
3452: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
3453: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3454: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3455: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
3456: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
3457: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
3458: }
3459: A->boundtocpu = flg;
3460: if (flg && a->inode.size) a->inode.use = PETSC_TRUE;
3461: else a->inode.use = PETSC_FALSE;
3462: PetscFunctionReturn(PETSC_SUCCESS);
3463: }
3465: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3466: {
3467: Mat B;
3469: PetscFunctionBegin;
3470: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
3471: if (reuse == MAT_INITIAL_MATRIX) {
3472: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
3473: } else if (reuse == MAT_REUSE_MATRIX) {
3474: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
3475: }
3476: B = *newmat;
3477: PetscCall(PetscFree(B->defaultvectype));
3478: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
3479: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3480: if (B->factortype == MAT_FACTOR_NONE) {
3481: Mat_SeqAIJHIPSPARSE *spptr;
3482: PetscCall(PetscNew(&spptr));
3483: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3484: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3485: spptr->format = MAT_HIPSPARSE_CSR;
3486: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3487: spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
3488: #else
3489: spptr->spmvAlg = HIPSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3490: #endif
3491: spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3492: //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;
3494: B->spptr = spptr;
3495: } else {
3496: Mat_SeqAIJHIPSPARSETriFactors *spptr;
3498: PetscCall(PetscNew(&spptr));
3499: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
3500: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
3501: B->spptr = spptr;
3502: }
3503: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3504: }
3505: B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
3506: B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
3507: B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
3508: B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
3509: B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
3510: B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;
3512: PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
3513: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
3514: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
3515: #if defined(PETSC_HAVE_HYPRE)
3516: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
3517: #endif
3518: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
3519: PetscFunctionReturn(PETSC_SUCCESS);
3520: }
3522: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
3523: {
3524: PetscFunctionBegin;
3525: PetscCall(MatCreate_SeqAIJ(B));
3526: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
3527: PetscFunctionReturn(PETSC_SUCCESS);
3528: }
3530: /*MC
3531: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
3533: A matrix type whose data resides on AMD GPUs. These matrices can be in either
3534: CSR, ELL, or Hybrid format.
3535: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
3537: Options Database Keys:
3538: + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
3539: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
3540: Other options include ell (ellpack) or hyb (hybrid).
3541: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
3542: - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
3544: Level: beginner
3546: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
3547: M*/
3549: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
3550: {
3551: PetscFunctionBegin;
3552: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
3553: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
3554: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
3555: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
3556: PetscFunctionReturn(PETSC_SUCCESS);
3557: }
3559: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
3560: {
3561: Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
3563: PetscFunctionBegin;
3564: if (cusp) {
3565: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
3566: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3567: delete cusp->workVector;
3568: delete cusp->rowoffsets_gpu;
3569: delete cusp->csr2csc_i;
3570: delete cusp->coords;
3571: if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
3572: PetscCall(PetscFree(mat->spptr));
3573: }
3574: PetscFunctionReturn(PETSC_SUCCESS);
3575: }
3577: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3578: {
3579: PetscFunctionBegin;
3580: if (*mat) {
3581: delete (*mat)->values;
3582: delete (*mat)->column_indices;
3583: delete (*mat)->row_offsets;
3584: delete *mat;
3585: *mat = 0;
3586: }
3587: PetscFunctionReturn(PETSC_SUCCESS);
3588: }
3590: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSETriFactorStruct **trifactor)
3591: {
3592: PetscFunctionBegin;
3593: if (*trifactor) {
3594: if ((*trifactor)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*trifactor)->descr));
3595: if ((*trifactor)->solveInfo) PetscCallHIPSPARSE(hipsparseDestroyCsrsvInfo((*trifactor)->solveInfo));
3596: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
3597: if ((*trifactor)->solveBuffer) PetscCallHIP(hipFree((*trifactor)->solveBuffer));
3598: if ((*trifactor)->AA_h) PetscCallHIP(hipHostFree((*trifactor)->AA_h));
3599: if ((*trifactor)->csr2cscBuffer) PetscCallHIP(hipFree((*trifactor)->csr2cscBuffer));
3600: PetscCall(PetscFree(*trifactor));
3601: }
3602: PetscFunctionReturn(PETSC_SUCCESS);
3603: }
3605: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
3606: {
3607: CsrMatrix *mat;
3609: PetscFunctionBegin;
3610: if (*matstruct) {
3611: if ((*matstruct)->mat) {
3612: if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
3613: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
3614: PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
3615: } else {
3616: mat = (CsrMatrix *)(*matstruct)->mat;
3617: PetscCall(CsrMatrix_Destroy(&mat));
3618: }
3619: }
3620: if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
3621: delete (*matstruct)->cprowIndices;
3622: if ((*matstruct)->alpha_one) PetscCallHIP(hipFree((*matstruct)->alpha_one));
3623: if ((*matstruct)->beta_zero) PetscCallHIP(hipFree((*matstruct)->beta_zero));
3624: if ((*matstruct)->beta_one) PetscCallHIP(hipFree((*matstruct)->beta_one));
3626: Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
3627: if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
3628: for (int i = 0; i < 3; i++) {
3629: if (mdata->hipSpMV[i].initialized) {
3630: PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
3631: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
3632: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
3633: }
3634: }
3635: delete *matstruct;
3636: *matstruct = NULL;
3637: }
3638: PetscFunctionReturn(PETSC_SUCCESS);
3639: }
3641: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
3642: {
3643: Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;
3645: PetscFunctionBegin;
3646: if (fs) {
3647: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3648: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3649: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3650: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3651: delete fs->rpermIndices;
3652: delete fs->cpermIndices;
3653: delete fs->workVector;
3654: fs->rpermIndices = NULL;
3655: fs->cpermIndices = NULL;
3656: fs->workVector = NULL;
3657: fs->init_dev_prop = PETSC_FALSE;
3658: #if PETSC_PKG_HIP_VERSION_GE(4, 5, 0)
3659: PetscCallHIP(hipFree(fs->csrRowPtr));
3660: PetscCallHIP(hipFree(fs->csrColIdx));
3661: PetscCallHIP(hipFree(fs->csrVal));
3662: PetscCallHIP(hipFree(fs->X));
3663: PetscCallHIP(hipFree(fs->Y));
3664: // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3665: PetscCallHIP(hipFree(fs->spsvBuffer_L));
3666: PetscCallHIP(hipFree(fs->spsvBuffer_U));
3667: PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
3668: PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
3669: PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
3670: if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
3671: if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
3672: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
3673: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3674: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
3675: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3676: if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
3677: if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
3678: PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
3679: PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));
3681: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3682: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3683: #endif
3684: }
3685: PetscFunctionReturn(PETSC_SUCCESS);
3686: }
3688: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
3689: {
3690: hipsparseHandle_t handle;
3692: PetscFunctionBegin;
3693: if (*trifactors) {
3694: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
3695: if ((handle = (*trifactors)->handle)) PetscCallHIPSPARSE(hipsparseDestroy(handle));
3696: PetscCall(PetscFree(*trifactors));
3697: }
3698: PetscFunctionReturn(PETSC_SUCCESS);
3699: }
3701: struct IJCompare {
3702: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3703: {
3704: if (t1.get<0>() < t2.get<0>()) return true;
3705: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3706: return false;
3707: }
3708: };
3710: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3711: {
3712: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3714: PetscFunctionBegin;
3715: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3716: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3717: if (destroy) {
3718: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3719: delete cusp->csr2csc_i;
3720: cusp->csr2csc_i = NULL;
3721: }
3722: A->transupdated = PETSC_FALSE;
3723: PetscFunctionReturn(PETSC_SUCCESS);
3724: }
3726: static PetscErrorCode MatCOOStructDestroy_SeqAIJHIPSPARSE(void *data)
3727: {
3728: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;
3730: PetscFunctionBegin;
3731: PetscCallHIP(hipFree(coo->perm));
3732: PetscCallHIP(hipFree(coo->jmap));
3733: PetscCall(PetscFree(coo));
3734: PetscFunctionReturn(PETSC_SUCCESS);
3735: }
3737: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3738: {
3739: PetscBool dev_ij = PETSC_FALSE;
3740: PetscMemType mtype = PETSC_MEMTYPE_HOST;
3741: PetscInt *i, *j;
3742: PetscContainer container_h, container_d;
3743: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
3745: PetscFunctionBegin;
3746: PetscCall(PetscGetMemType(coo_i, &mtype));
3747: if (PetscMemTypeDevice(mtype)) {
3748: dev_ij = PETSC_TRUE;
3749: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
3750: PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3751: PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
3752: } else {
3753: i = coo_i;
3754: j = coo_j;
3755: }
3756: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
3757: if (dev_ij) PetscCall(PetscFree2(i, j));
3758: mat->offloadmask = PETSC_OFFLOAD_CPU;
3759: // Create the GPU memory
3760: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mat));
3762: // Copy the COO struct to device
3763: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
3764: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
3765: PetscCall(PetscMalloc1(1, &coo_d));
3766: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
3767: PetscCallHIP(hipMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
3768: PetscCallHIP(hipMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
3769: PetscCallHIP(hipMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
3770: PetscCallHIP(hipMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), hipMemcpyHostToDevice));
3772: // Put the COO struct in a container and then attach that to the matrix
3773: PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
3774: PetscCall(PetscContainerSetPointer(container_d, coo_d));
3775: PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJHIPSPARSE));
3776: PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
3777: PetscCall(PetscContainerDestroy(&container_d));
3778: PetscFunctionReturn(PETSC_SUCCESS);
3779: }
3781: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
3782: {
3783: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
3784: const PetscCount grid_size = gridDim.x * blockDim.x;
3785: for (; i < nnz; i += grid_size) {
3786: PetscScalar sum = 0.0;
3787: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
3788: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
3789: }
3790: }
3792: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3793: {
3794: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
3795: Mat_SeqAIJHIPSPARSE *dev = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3796: PetscCount Annz = seq->nz;
3797: PetscMemType memtype;
3798: const PetscScalar *v1 = v;
3799: PetscScalar *Aa;
3800: PetscContainer container;
3801: MatCOOStruct_SeqAIJ *coo;
3803: PetscFunctionBegin;
3804: if (!dev->mat) PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3806: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
3807: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
3809: PetscCall(PetscGetMemType(v, &memtype));
3810: if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
3811: PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
3812: PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
3813: }
3815: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa));
3816: else PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa));
3818: PetscCall(PetscLogGpuTimeBegin());
3819: if (Annz) {
3820: hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddCOOValues), dim3((Annz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, Annz, coo->jmap, coo->perm, imode, Aa);
3821: PetscCallHIP(hipPeekAtLastError());
3822: }
3823: PetscCall(PetscLogGpuTimeEnd());
3825: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
3826: else PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
3828: if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
3829: PetscFunctionReturn(PETSC_SUCCESS);
3830: }
3832: /*@C
3833: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3835: Not Collective
3837: Input Parameters:
3838: + A - the matrix
3839: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3841: Output Parameters:
3842: + i - the CSR row pointers
3843: - j - the CSR column indices
3845: Level: developer
3847: Note:
3848: When compressed is true, the CSR structure does not contain empty rows
3850: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3851: @*/
3852: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3853: {
3854: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3855: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3856: CsrMatrix *csr;
3858: PetscFunctionBegin;
3860: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
3861: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3862: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3863: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3864: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3865: csr = (CsrMatrix *)cusp->mat->mat;
3866: if (i) {
3867: if (!compressed && a->compressedrow.use) { /* need full row offset */
3868: if (!cusp->rowoffsets_gpu) {
3869: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3870: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3871: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3872: }
3873: *i = cusp->rowoffsets_gpu->data().get();
3874: } else *i = csr->row_offsets->data().get();
3875: }
3876: if (j) *j = csr->column_indices->data().get();
3877: PetscFunctionReturn(PETSC_SUCCESS);
3878: }
3880: /*@C
3881: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3883: Not Collective
3885: Input Parameters:
3886: + A - the matrix
3887: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3888: . i - the CSR row pointers
3889: - j - the CSR column indices
3891: Level: developer
3893: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3894: @*/
3895: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3896: {
3897: PetscFunctionBegin;
3899: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3900: if (i) *i = NULL;
3901: if (j) *j = NULL;
3902: PetscFunctionReturn(PETSC_SUCCESS);
3903: }
3905: /*@C
3906: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3908: Not Collective
3910: Input Parameter:
3911: . A - a `MATSEQAIJHIPSPARSE` matrix
3913: Output Parameter:
3914: . a - pointer to the device data
3916: Level: developer
3918: Note:
3919: May trigger host-device copies if the up-to-date matrix data is on host
3921: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3922: @*/
3923: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3924: {
3925: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3926: CsrMatrix *csr;
3928: PetscFunctionBegin;
3930: PetscAssertPointer(a, 2);
3931: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3932: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3933: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3934: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3935: csr = (CsrMatrix *)cusp->mat->mat;
3936: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3937: *a = csr->values->data().get();
3938: PetscFunctionReturn(PETSC_SUCCESS);
3939: }
3941: /*@C
3942: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3944: Not Collective
3946: Input Parameters:
3947: + A - a `MATSEQAIJHIPSPARSE` matrix
3948: - a - pointer to the device data
3950: Level: developer
3952: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3953: @*/
3954: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3955: {
3956: PetscFunctionBegin;
3958: PetscAssertPointer(a, 2);
3959: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3960: *a = NULL;
3961: PetscFunctionReturn(PETSC_SUCCESS);
3962: }
3964: /*@C
3965: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3967: Not Collective
3969: Input Parameter:
3970: . A - a `MATSEQAIJHIPSPARSE` matrix
3972: Output Parameter:
3973: . a - pointer to the device data
3975: Level: developer
3977: Note:
3978: May trigger host-device copies if up-to-date matrix data is on host
3980: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3981: @*/
3982: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar **a)
3983: {
3984: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
3985: CsrMatrix *csr;
3987: PetscFunctionBegin;
3989: PetscAssertPointer(a, 2);
3990: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3991: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3992: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3993: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3994: csr = (CsrMatrix *)cusp->mat->mat;
3995: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
3996: *a = csr->values->data().get();
3997: A->offloadmask = PETSC_OFFLOAD_GPU;
3998: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
3999: PetscFunctionReturn(PETSC_SUCCESS);
4000: }
4001: /*@C
4002: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
4004: Not Collective
4006: Input Parameters:
4007: + A - a `MATSEQAIJHIPSPARSE` matrix
4008: - a - pointer to the device data
4010: Level: developer
4012: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
4013: @*/
4014: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar **a)
4015: {
4016: PetscFunctionBegin;
4018: PetscAssertPointer(a, 2);
4019: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4020: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4021: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4022: *a = NULL;
4023: PetscFunctionReturn(PETSC_SUCCESS);
4024: }
4026: /*@C
4027: MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
4029: Not Collective
4031: Input Parameter:
4032: . A - a `MATSEQAIJHIPSPARSE` matrix
4034: Output Parameter:
4035: . a - pointer to the device data
4037: Level: developer
4039: Note:
4040: Does not trigger host-device copies and flags data validity on the GPU
4042: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
4043: @*/
4044: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4045: {
4046: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
4047: CsrMatrix *csr;
4049: PetscFunctionBegin;
4051: PetscAssertPointer(a, 2);
4052: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4053: PetscCheck(cusp->format != MAT_HIPSPARSE_ELL && cusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4054: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4055: csr = (CsrMatrix *)cusp->mat->mat;
4056: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing HIP memory");
4057: *a = csr->values->data().get();
4058: A->offloadmask = PETSC_OFFLOAD_GPU;
4059: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
4060: PetscFunctionReturn(PETSC_SUCCESS);
4061: }
4063: /*@C
4064: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
4066: Not Collective
4068: Input Parameters:
4069: + A - a `MATSEQAIJHIPSPARSE` matrix
4070: - a - pointer to the device data
4072: Level: developer
4074: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
4075: @*/
4076: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4077: {
4078: PetscFunctionBegin;
4080: PetscAssertPointer(a, 2);
4081: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4082: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4083: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4084: *a = NULL;
4085: PetscFunctionReturn(PETSC_SUCCESS);
4086: }
4088: struct IJCompare4 {
4089: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4090: {
4091: if (t1.get<0>() < t2.get<0>()) return true;
4092: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4093: return false;
4094: }
4095: };
4097: struct Shift {
4098: int _shift;
4100: Shift(int shift) : _shift(shift) { }
4101: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4102: };
4104: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4105: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4106: {
4107: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4108: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
4109: Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
4110: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4111: PetscInt Annz, Bnnz;
4112: PetscInt i, m, n, zero = 0;
4114: PetscFunctionBegin;
4117: PetscAssertPointer(C, 4);
4118: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
4119: PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
4120: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4121: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4122: PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4123: PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4124: if (reuse == MAT_INITIAL_MATRIX) {
4125: m = A->rmap->n;
4126: n = A->cmap->n + B->cmap->n;
4127: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4128: PetscCall(MatSetSizes(*C, m, n, m, n));
4129: PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
4130: c = (Mat_SeqAIJ *)(*C)->data;
4131: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4132: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
4133: Ccsr = new CsrMatrix;
4134: Cmat->cprowIndices = NULL;
4135: c->compressedrow.use = PETSC_FALSE;
4136: c->compressedrow.nrows = 0;
4137: c->compressedrow.i = NULL;
4138: c->compressedrow.rindex = NULL;
4139: Ccusp->workVector = NULL;
4140: Ccusp->nrows = m;
4141: Ccusp->mat = Cmat;
4142: Ccusp->mat->mat = Ccsr;
4143: Ccsr->num_rows = m;
4144: Ccsr->num_cols = n;
4145: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
4146: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
4147: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4148: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4149: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4150: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4151: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4152: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4153: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4154: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4155: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4156: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4157: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4159: Acsr = (CsrMatrix *)Acusp->mat->mat;
4160: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4161: Annz = (PetscInt)Acsr->column_indices->size();
4162: Bnnz = (PetscInt)Bcsr->column_indices->size();
4163: c->nz = Annz + Bnnz;
4164: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4165: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4166: Ccsr->values = new THRUSTARRAY(c->nz);
4167: Ccsr->num_entries = c->nz;
4168: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4169: if (c->nz) {
4170: auto Acoo = new THRUSTINTARRAY32(Annz);
4171: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4172: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4173: THRUSTINTARRAY32 *Aroff, *Broff;
4175: if (a->compressedrow.use) { /* need full row offset */
4176: if (!Acusp->rowoffsets_gpu) {
4177: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4178: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4179: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4180: }
4181: Aroff = Acusp->rowoffsets_gpu;
4182: } else Aroff = Acsr->row_offsets;
4183: if (b->compressedrow.use) { /* need full row offset */
4184: if (!Bcusp->rowoffsets_gpu) {
4185: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4186: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4187: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4188: }
4189: Broff = Bcusp->rowoffsets_gpu;
4190: } else Broff = Bcsr->row_offsets;
4191: PetscCall(PetscLogGpuTimeBegin());
4192: PetscCallHIPSPARSE(hipsparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4193: PetscCallHIPSPARSE(hipsparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4194: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4195: auto Aperm = thrust::make_constant_iterator(1);
4196: auto Bperm = thrust::make_constant_iterator(0);
4197: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4198: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4199: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4200: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4201: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4202: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4203: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4204: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4205: auto p1 = Ccusp->coords->begin();
4206: auto p2 = Ccusp->coords->begin();
4207: thrust::advance(p2, Annz);
4208: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4209: auto cci = thrust::make_counting_iterator(zero);
4210: auto cce = thrust::make_counting_iterator(c->nz);
4211: #if 0 //Errors on SUMMIT cuda 11.1.0
4212: PetscCallThrust(thrust::partition_copy(thrust::device, cci, cce, wPerm->begin(), p1, p2, thrust::identity<int>()));
4213: #else
4214: auto pred = thrust::identity<int>();
4215: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4216: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4217: #endif
4218: PetscCallHIPSPARSE(hipsparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), HIPSPARSE_INDEX_BASE_ZERO));
4219: PetscCall(PetscLogGpuTimeEnd());
4220: delete wPerm;
4221: delete Acoo;
4222: delete Bcoo;
4223: delete Ccoo;
4224: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4226: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4227: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
4228: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
4229: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4230: Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
4231: CsrMatrix *CcsrT = new CsrMatrix;
4232: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4233: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4235: (*C)->form_explicit_transpose = PETSC_TRUE;
4236: (*C)->transupdated = PETSC_TRUE;
4237: Ccusp->rowoffsets_gpu = NULL;
4238: CmatT->cprowIndices = NULL;
4239: CmatT->mat = CcsrT;
4240: CcsrT->num_rows = n;
4241: CcsrT->num_cols = m;
4242: CcsrT->num_entries = c->nz;
4243: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4244: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4245: CcsrT->values = new THRUSTARRAY(c->nz);
4247: PetscCall(PetscLogGpuTimeBegin());
4248: auto rT = CcsrT->row_offsets->begin();
4249: if (AT) {
4250: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4251: thrust::advance(rT, -1);
4252: }
4253: if (BT) {
4254: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4255: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4256: thrust::copy(titb, tite, rT);
4257: }
4258: auto cT = CcsrT->column_indices->begin();
4259: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4260: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4261: auto vT = CcsrT->values->begin();
4262: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4263: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4264: PetscCall(PetscLogGpuTimeEnd());
4266: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
4267: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
4268: PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
4269: PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4270: PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4271: PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4272: PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4273: PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
4274: PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
4276: PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
4277: Ccusp->matTranspose = CmatT;
4278: }
4279: }
4281: c->singlemalloc = PETSC_FALSE;
4282: c->free_a = PETSC_TRUE;
4283: c->free_ij = PETSC_TRUE;
4284: PetscCall(PetscMalloc1(m + 1, &c->i));
4285: PetscCall(PetscMalloc1(c->nz, &c->j));
4286: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4287: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4288: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4289: ii = *Ccsr->row_offsets;
4290: jj = *Ccsr->column_indices;
4291: PetscCallHIP(hipMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4292: PetscCallHIP(hipMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4293: } else {
4294: PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4295: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
4296: }
4297: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4298: PetscCall(PetscMalloc1(m, &c->ilen));
4299: PetscCall(PetscMalloc1(m, &c->imax));
4300: c->maxnz = c->nz;
4301: c->nonzerorowcnt = 0;
4302: c->rmax = 0;
4303: for (i = 0; i < m; i++) {
4304: const PetscInt nn = c->i[i + 1] - c->i[i];
4305: c->ilen[i] = c->imax[i] = nn;
4306: c->nonzerorowcnt += (PetscInt) !!nn;
4307: c->rmax = PetscMax(c->rmax, nn);
4308: }
4309: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4310: PetscCall(PetscMalloc1(c->nz, &c->a));
4311: (*C)->nonzerostate++;
4312: PetscCall(PetscLayoutSetUp((*C)->rmap));
4313: PetscCall(PetscLayoutSetUp((*C)->cmap));
4314: Ccusp->nonzerostate = (*C)->nonzerostate;
4315: (*C)->preallocated = PETSC_TRUE;
4316: } else {
4317: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4318: c = (Mat_SeqAIJ *)(*C)->data;
4319: if (c->nz) {
4320: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
4321: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4322: PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4323: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4324: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
4325: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
4326: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4327: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
4328: Acsr = (CsrMatrix *)Acusp->mat->mat;
4329: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4330: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4331: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4332: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4333: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4334: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4335: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4336: auto pmid = Ccusp->coords->begin();
4337: thrust::advance(pmid, Acsr->num_entries);
4338: PetscCall(PetscLogGpuTimeBegin());
4339: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4340: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4341: thrust::for_each(zibait, zieait, VecHIPEquals());
4342: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4343: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4344: thrust::for_each(zibbit, ziebit, VecHIPEquals());
4345: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4346: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4347: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
4348: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4349: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4350: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4351: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4352: auto vT = CcsrT->values->begin();
4353: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4354: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4355: (*C)->transupdated = PETSC_TRUE;
4356: }
4357: PetscCall(PetscLogGpuTimeEnd());
4358: }
4359: }
4360: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4361: (*C)->assembled = PETSC_TRUE;
4362: (*C)->was_assembled = PETSC_FALSE;
4363: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4364: PetscFunctionReturn(PETSC_SUCCESS);
4365: }
4367: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4368: {
4369: bool dmem;
4370: const PetscScalar *av;
4372: PetscFunctionBegin;
4373: dmem = isHipMem(v);
4374: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(A, &av));
4375: if (n && idx) {
4376: THRUSTINTARRAY widx(n);
4377: widx.assign(idx, idx + n);
4378: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4380: THRUSTARRAY *w = NULL;
4381: thrust::device_ptr<PetscScalar> dv;
4382: if (dmem) dv = thrust::device_pointer_cast(v);
4383: else {
4384: w = new THRUSTARRAY(n);
4385: dv = w->data();
4386: }
4387: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4389: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4390: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4391: thrust::for_each(zibit, zieit, VecHIPEquals());
4392: if (w) PetscCallHIP(hipMemcpy(v, w->data().get(), n * sizeof(PetscScalar), hipMemcpyDeviceToHost));
4393: delete w;
4394: } else PetscCallHIP(hipMemcpy(v, av, n * sizeof(PetscScalar), dmem ? hipMemcpyDeviceToDevice : hipMemcpyDeviceToHost));
4396: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
4397: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(A, &av));
4398: PetscFunctionReturn(PETSC_SUCCESS);
4399: }