Actual source code: aijhipsparse.hip.cxx
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the HIPSPARSE library,
4: Portions of this code are under:
5: Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
6: */
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/mat/impls/dense/seq/dense.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
15: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
16: #include <thrust/adjacent_difference.h>
17: #include <thrust/iterator/transform_iterator.h>
18: #if PETSC_CPP_VERSION >= 14
19: #define PETSC_HAVE_THRUST_ASYNC 1
20: #include <thrust/async/for_each.h>
21: #endif
22: #include <thrust/iterator/constant_iterator.h>
23: #include <thrust/iterator/discard_iterator.h>
24: #include <thrust/binary_search.h>
25: #include <thrust/remove.h>
26: #include <thrust/sort.h>
27: #include <thrust/unique.h>
28: #include <thrust/gather.h>
30: const char *const MatHIPSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatHIPSPARSEStorageFormat", "MAT_HIPSPARSE_", 0};
31: const char *const MatHIPSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "SPMV_ALG_DEFAULT", "SPMV_COO_ALG1", "SPMV_COO_ALG2", "SPMV_CSR_ALG1", "SPMV_CSR_ALG2", "hipsparseSpMVAlg_t", "HIPSPARSE_", 0};
32: const char *const MatHIPSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "hipsparseSpMMAlg_t", "HIPSPARSE_SPMM_", 0};
33: //const char *const MatHIPSPARSECsr2CscAlgorithms[] = {"INVALID"/*HIPSPARSE does not have enum 0! We created one*/, "ALG1", "ALG2", "hipsparseCsr2CscAlg_t", "HIPSPARSE_CSR2CSC_", 0};
35: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
36: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, const MatFactorInfo *);
37: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat, Mat, const MatFactorInfo *);
38: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
39: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat, PetscOptionItems PetscOptionsObject);
40: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat, PetscScalar, Mat, MatStructure);
41: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat, PetscScalar);
42: static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat, Vec, Vec);
43: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat, Vec, Vec);
44: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
45: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
46: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
47: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat, Vec, Vec);
48: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec);
49: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
51: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
52: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **, MatHIPSPARSEStorageFormat);
53: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **);
54: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat);
56: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat);
57: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat, PetscBool);
59: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
60: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
61: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat, const PetscScalar[], InsertMode);
63: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
65: // cusparseCreateCsr() separates types for row offsets and column indices in prototype, but requires them to have the same type at runtime!
66: const hipsparseIndexType_t csrRowOffsetsType = PetscDefined(USE_64BIT_INDICES) ? HIPSPARSE_INDEX_64I : HIPSPARSE_INDEX_32I;
67: const hipsparseIndexType_t csrColIndType = PetscDefined(USE_64BIT_INDICES) ? HIPSPARSE_INDEX_64I : HIPSPARSE_INDEX_32I;
69: using Csr2coo = Petsc::mat::aij::cupm::impl::Csr2coo;
70: using PetscIntToCInt = Petsc::mat::aij::cupm::impl::PetscIntToCInt;
72: /*
73: PetscErrorCode MatHIPSPARSESetStream(Mat A, const hipStream_t stream)
74: {
75: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
77: PetscFunctionBegin;
78: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
79: hipsparsestruct->stream = stream;
80: PetscCallHIPSPARSE(hipsparseSetStream(hipsparsestruct->handle, hipsparsestruct->stream));
81: PetscFunctionReturn(PETSC_SUCCESS);
82: }
84: PetscErrorCode MatHIPSPARSESetHandle(Mat A, const hipsparseHandle_t handle)
85: {
86: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
88: PetscFunctionBegin;
89: PetscCheck(hipsparsestruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
90: if (hipsparsestruct->handle != handle) {
91: if (hipsparsestruct->handle) PetscCallHIPSPARSE(hipsparseDestroy(hipsparsestruct->handle));
92: hipsparsestruct->handle = handle;
93: }
94: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
95: PetscFunctionReturn(PETSC_SUCCESS);
96: }
98: PetscErrorCode MatHIPSPARSEClearHandle(Mat A)
99: {
100: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE*)A->spptr;
101: PetscBool flg;
103: PetscFunctionBegin;
104: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
105: if (!flg || !hipsparsestruct) PetscFunctionReturn(PETSC_SUCCESS);
106: if (hipsparsestruct->handle) hipsparsestruct->handle = 0;
107: PetscFunctionReturn(PETSC_SUCCESS);
108: }
109: */
111: PETSC_INTERN PetscErrorCode MatHIPSPARSESetFormat_SeqAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
112: {
113: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
115: PetscFunctionBegin;
116: switch (op) {
117: case MAT_HIPSPARSE_MULT:
118: hipsparsestruct->format = format;
119: break;
120: case MAT_HIPSPARSE_ALL:
121: hipsparsestruct->format = format;
122: break;
123: default:
124: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. MAT_HIPSPARSE_MULT and MAT_HIPSPARSE_ALL are currently supported.", op);
125: }
126: PetscFunctionReturn(PETSC_SUCCESS);
127: }
129: /*@
130: MatHIPSPARSESetFormat - Sets the storage format of `MATSEQHIPSPARSE` matrices for a particular
131: operation. Only the `MatMult()` operation can use different GPU storage formats
133: Not Collective
135: Input Parameters:
136: + A - Matrix of type `MATSEQAIJHIPSPARSE`
137: . op - `MatHIPSPARSEFormatOperation`. `MATSEQAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT` and `MAT_HIPSPARSE_ALL`.
138: `MATMPIAIJHIPSPARSE` matrices support `MAT_HIPSPARSE_MULT_DIAG`, `MAT_HIPSPARSE_MULT_OFFDIAG`, and `MAT_HIPSPARSE_ALL`.
139: - format - `MatHIPSPARSEStorageFormat` (one of `MAT_HIPSPARSE_CSR`, `MAT_HIPSPARSE_ELL`, `MAT_HIPSPARSE_HYB`.)
141: Level: intermediate
143: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
144: @*/
145: PetscErrorCode MatHIPSPARSESetFormat(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
146: {
147: PetscFunctionBegin;
149: PetscTryMethod(A, "MatHIPSPARSESetFormat_C", (Mat, MatHIPSPARSEFormatOperation, MatHIPSPARSEStorageFormat), (A, op, format));
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
153: PETSC_INTERN PetscErrorCode MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE(Mat A, PetscBool use_cpu)
154: {
155: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
157: PetscFunctionBegin;
158: hipsparsestruct->use_cpu_solve = use_cpu;
159: PetscFunctionReturn(PETSC_SUCCESS);
160: }
162: /*@
163: MatHIPSPARSESetUseCPUSolve - Sets use CPU `MatSolve()`.
165: Input Parameters:
166: + A - Matrix of type `MATSEQAIJHIPSPARSE`
167: - use_cpu - set flag for using the built-in CPU `MatSolve()`
169: Level: intermediate
171: Notes:
172: The hipSparse LU solver currently computes the factors with the built-in CPU method
173: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
174: This method to specifies if the solve is done on the CPU or GPU (GPU is the default).
176: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJHIPSPARSE`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
177: @*/
178: PetscErrorCode MatHIPSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
179: {
180: PetscFunctionBegin;
182: PetscTryMethod(A, "MatHIPSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
183: PetscFunctionReturn(PETSC_SUCCESS);
184: }
186: static PetscErrorCode MatSetOption_SeqAIJHIPSPARSE(Mat A, MatOption op, PetscBool flg)
187: {
188: PetscFunctionBegin;
189: switch (op) {
190: case MAT_FORM_EXPLICIT_TRANSPOSE:
191: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
192: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
193: A->form_explicit_transpose = flg;
194: break;
195: default:
196: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
197: break;
198: }
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: static PetscErrorCode MatSetFromOptions_SeqAIJHIPSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
203: {
204: MatHIPSPARSEStorageFormat format;
205: PetscBool flg;
206: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
208: PetscFunctionBegin;
209: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJHIPSPARSE options");
210: if (A->factortype == MAT_FACTOR_NONE) {
211: PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
212: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT, format));
213: PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of (seq)aijhipsparse gpu matrices for SpMV and TriSolve", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparsestruct->format, (PetscEnum *)&format, &flg));
214: if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
215: PetscCall(PetscOptionsBool("-mat_hipsparse_use_cpu_solve", "Use CPU (I)LU solve", "MatHIPSPARSESetUseCPUSolve", hipsparsestruct->use_cpu_solve, &hipsparsestruct->use_cpu_solve, &flg));
216: if (flg) PetscCall(MatHIPSPARSESetUseCPUSolve(A, hipsparsestruct->use_cpu_solve));
217: PetscCall(
218: PetscOptionsEnum("-mat_hipsparse_spmv_alg", "sets hipSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "hipsparseSpMVAlg_t", MatHIPSPARSESpMVAlgorithms, (PetscEnum)hipsparsestruct->spmvAlg, (PetscEnum *)&hipsparsestruct->spmvAlg, &flg));
219: /* If user did use this option, check its consistency with hipSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatHIPSPARSESpMVAlgorithms[] */
220: PetscCheck(!flg || HIPSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
221: PetscCall(
222: PetscOptionsEnum("-mat_hipsparse_spmm_alg", "sets hipSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "hipsparseSpMMAlg_t", MatHIPSPARSESpMMAlgorithms, (PetscEnum)hipsparsestruct->spmmAlg, (PetscEnum *)&hipsparsestruct->spmmAlg, &flg));
223: PetscCheck(!flg || HIPSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
224: /*
225: PetscCall(PetscOptionsEnum("-mat_hipsparse_csr2csc_alg", "sets hipSPARSE algorithm used in converting CSR matrices to CSC matrices", "hipsparseCsr2CscAlg_t", MatHIPSPARSECsr2CscAlgorithms, (PetscEnum)hipsparsestruct->csr2cscAlg, (PetscEnum*)&hipsparsestruct->csr2cscAlg, &flg));
226: PetscCheck(!flg || HIPSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE enum hipsparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
227: */
228: }
229: PetscOptionsHeadEnd();
230: PetscFunctionReturn(PETSC_SUCCESS);
231: }
233: static PetscErrorCode MatSeqAIJHIPSPARSEBuildFactoredMatrix_LU(Mat A)
234: {
235: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
236: PetscInt m = A->rmap->n;
237: Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
238: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
239: const MatScalar *Aa = a->a;
240: PetscInt *Mi, *Mj, Mnz;
241: PetscScalar *Ma;
243: PetscFunctionBegin;
244: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
245: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
246: if (!fs->csrRowPtr) { // Is this the first time we are doing setup? Use csrRowPtr since it is not null even when m=0
247: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
248: Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
249: PetscCall(PetscMalloc1(m + 1, &Mi));
250: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
251: PetscCall(PetscMalloc1(Mnz, &Ma));
252: Mi[0] = 0;
253: for (PetscInt i = 0; i < m; i++) {
254: PetscInt llen = Ai[i + 1] - Ai[i];
255: PetscInt ulen = adiag[i] - adiag[i + 1];
256: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
257: Mj[Mi[i] + llen] = i; // diagonal entry
258: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
259: Mi[i + 1] = Mi[i] + llen + ulen;
260: }
261: // Copy M (L,U) from host to device
262: PetscCallHIP(hipMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
263: PetscCallHIP(hipMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
264: PetscCallHIP(hipMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
265: PetscCallHIP(hipMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), hipMemcpyHostToDevice));
266: PetscCallHIP(hipMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, hipMemcpyHostToDevice));
268: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
269: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
270: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
271: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
272: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
273: hipsparseFillMode_t fillMode = HIPSPARSE_FILL_MODE_LOWER;
274: hipsparseDiagType_t diagType = HIPSPARSE_DIAG_TYPE_UNIT;
276: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
277: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
278: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
280: fillMode = HIPSPARSE_FILL_MODE_UPPER;
281: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
282: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
283: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
284: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
286: // Allocate work vectors in SpSv
287: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(*fs->X) * m));
288: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
290: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
291: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
293: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = HIPSPARSE_OPERATION_NON_TRANSPOSE
294: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
295: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
296: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
297: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
298: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
299: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
301: // Record for reuse
302: fs->csrRowPtr_h = Mi;
303: fs->csrVal_h = Ma;
304: PetscCall(PetscFree(Mj));
305: }
306: // Copy the value
307: Mi = fs->csrRowPtr_h;
308: Ma = fs->csrVal_h;
309: Mnz = Mi[m];
310: for (PetscInt i = 0; i < m; i++) {
311: PetscInt llen = Ai[i + 1] - Ai[i];
312: PetscInt ulen = adiag[i] - adiag[i + 1];
313: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
314: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]]; // recover the diagonal entry
315: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
316: }
317: PetscCallHIP(hipMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, hipMemcpyHostToDevice));
319: {
320: // Do hipsparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
321: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
323: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
324: fs->updatedSpSVAnalysis = PETSC_TRUE;
325: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
326: }
327: }
328: PetscFunctionReturn(PETSC_SUCCESS);
329: }
331: static PetscErrorCode MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(Mat A)
332: {
333: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
334: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
335: IS isrow = a->row, isicol = a->icol;
336: PetscBool row_identity, col_identity;
337: PetscInt n = A->rmap->n;
339: PetscFunctionBegin;
340: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
341: PetscCall(MatSeqAIJHIPSPARSEBuildFactoredMatrix_LU(A));
343: hipsparseTriFactors->nnz = a->nz;
345: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
346: /* lower triangular indices */
347: PetscCall(ISIdentity(isrow, &row_identity));
348: if (!row_identity && !hipsparseTriFactors->rpermIndices) {
349: const PetscInt *r;
351: PetscCall(ISGetIndices(isrow, &r));
352: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
353: hipsparseTriFactors->rpermIndices->assign(r, r + n);
354: PetscCall(ISRestoreIndices(isrow, &r));
355: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
356: }
358: /* upper triangular indices */
359: PetscCall(ISIdentity(isicol, &col_identity));
360: if (!col_identity && !hipsparseTriFactors->cpermIndices) {
361: const PetscInt *c;
363: PetscCall(ISGetIndices(isicol, &c));
364: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
365: hipsparseTriFactors->cpermIndices->assign(c, c + n);
366: PetscCall(ISRestoreIndices(isicol, &c));
367: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
368: }
369: PetscFunctionReturn(PETSC_SUCCESS);
370: }
372: static PetscErrorCode MatSeqAIJHIPSPARSEBuildFactoredMatrix_Cholesky(Mat A)
373: {
374: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
375: PetscInt m = A->rmap->n;
376: Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
377: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
378: const MatScalar *Aa = a->a;
379: PetscInt *Mj, Mnz;
380: PetscScalar *Ma, *D;
382: PetscFunctionBegin;
383: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
384: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
385: if (!fs->csrRowPtr) { // Is this the first time we are doing setup? Use csrRowPtr since it is not null even m=0
386: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
387: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
388: Mnz = Ai[m]; // Unz (with the unit diagonal)
389: PetscCall(PetscMalloc1(Mnz, &Ma));
390: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
391: PetscCall(PetscMalloc1(m, &D)); // the diagonal
392: for (PetscInt i = 0; i < m; i++) {
393: PetscInt ulen = Ai[i + 1] - Ai[i];
394: Mj[Ai[i]] = i; // diagonal entry
395: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
396: }
397: // Copy M (U) from host to device
398: PetscCallHIP(hipMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
399: PetscCallHIP(hipMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
400: PetscCallHIP(hipMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
401: PetscCallHIP(hipMalloc(&fs->diag, sizeof(*fs->diag) * m));
402: PetscCallHIP(hipMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), hipMemcpyHostToDevice));
403: PetscCallHIP(hipMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, hipMemcpyHostToDevice));
405: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
406: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
407: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
408: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
409: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
410: hipsparseFillMode_t fillMode = HIPSPARSE_FILL_MODE_UPPER;
411: hipsparseDiagType_t diagType = HIPSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
413: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
414: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
415: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
417: // Allocate work vectors in SpSv
418: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(*fs->X) * m));
419: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
421: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
422: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
424: // Query buffer sizes for SpSV and then allocate buffers
425: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
426: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
427: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
429: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
430: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
431: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
433: // Record for reuse
434: fs->csrVal_h = Ma;
435: fs->diag_h = D;
436: PetscCall(PetscFree(Mj));
437: }
438: // Copy the value
439: Ma = fs->csrVal_h;
440: D = fs->diag_h;
441: Mnz = Ai[m];
442: for (PetscInt i = 0; i < m; i++) {
443: D[i] = Aa[adiag[i]]; // actually Aa[adiag[i]] is the inverse of the diagonal
444: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
445: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
446: }
447: PetscCallHIP(hipMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, hipMemcpyHostToDevice));
448: PetscCallHIP(hipMemcpy(fs->diag, D, sizeof(*D) * m, hipMemcpyHostToDevice));
450: {
451: // Do hipsparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
452: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
453: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
454: fs->updatedSpSVAnalysis = PETSC_TRUE;
455: }
456: }
457: PetscFunctionReturn(PETSC_SUCCESS);
458: }
460: // Solve Ut D U x = b
461: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_Cholesky(Mat A, Vec b, Vec x)
462: {
463: Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
464: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
465: const PetscScalar *barray;
466: PetscScalar *xarray;
467: thrust::device_ptr<const PetscScalar> bGPU;
468: thrust::device_ptr<PetscScalar> xGPU;
469: const hipsparseSpSVAlg_t alg = HIPSPARSE_SPSV_ALG_DEFAULT;
470: PetscInt m = A->rmap->n;
472: PetscFunctionBegin;
473: PetscCall(PetscLogGpuTimeBegin());
474: PetscCall(VecHIPGetArrayWrite(x, &xarray));
475: PetscCall(VecHIPGetArrayRead(b, &barray));
476: xGPU = thrust::device_pointer_cast(xarray);
477: bGPU = thrust::device_pointer_cast(barray);
479: // Reorder b with the row permutation if needed, and wrap the result in fs->X
480: if (fs->rpermIndices)
481: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
483: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));
485: // Solve Ut Y = X
486: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
487: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
488: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut));
489: #else
490: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
491: #endif
493: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
494: // It is basically a vector element-wise multiplication, but cublas does not have it!
495: auto multiplies = thrust::multiplies<PetscScalar>();
496: PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));
498: // Solve U X = Y
499: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray)); // if need to permute, we need to use the intermediate buffer X
501: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
502: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U));
503: #else
504: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U, fs->spsvBuffer_U));
505: #endif
507: // Reorder X with the column permutation if needed, and put the result back to x
508: if (fs->cpermIndices)
509: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
510: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
512: PetscCall(VecHIPRestoreArrayRead(b, &barray));
513: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
514: PetscCall(PetscLogGpuTimeEnd());
515: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
516: PetscFunctionReturn(PETSC_SUCCESS);
517: }
519: static PetscErrorCode MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(Mat A)
520: {
521: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
522: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
523: IS ip = a->row;
524: PetscBool perm_identity;
525: PetscInt n = A->rmap->n;
527: PetscFunctionBegin;
528: PetscCheck(hipsparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing hipsparseTriFactors");
530: PetscCall(MatSeqAIJHIPSPARSEBuildFactoredMatrix_Cholesky(A));
531: hipsparseTriFactors->nnz = (a->nz - n) * 2 + n;
533: A->offloadmask = PETSC_OFFLOAD_BOTH;
535: /* lower triangular indices */
536: PetscCall(ISIdentity(ip, &perm_identity));
537: if (!perm_identity) {
538: IS iip;
539: const PetscInt *irip, *rip;
541: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
542: PetscCall(ISGetIndices(iip, &irip));
543: PetscCall(ISGetIndices(ip, &rip));
544: hipsparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
545: hipsparseTriFactors->rpermIndices->assign(rip, rip + n);
546: hipsparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
547: hipsparseTriFactors->cpermIndices->assign(irip, irip + n);
548: PetscCall(ISRestoreIndices(iip, &irip));
549: PetscCall(ISDestroy(&iip));
550: PetscCall(ISRestoreIndices(ip, &rip));
551: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
552: }
553: PetscFunctionReturn(PETSC_SUCCESS);
554: }
556: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
557: {
558: PetscFunctionBegin;
559: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
560: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
561: B->offloadmask = PETSC_OFFLOAD_CPU;
562: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_Cholesky;
563: B->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_Cholesky; // since symmetric
564: B->ops->matsolve = NULL;
565: B->ops->matsolvetranspose = NULL;
566: /* get the triangular factors */
567: PetscCall(MatSeqAIJHIPSPARSEICCAnalysisAndCopyToGPU(B));
568: PetscFunctionReturn(PETSC_SUCCESS);
569: }
571: static PetscErrorCode MatSeqAIJHIPSPARSEFormExplicitTranspose(Mat A)
572: {
573: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
574: Mat_SeqAIJHIPSPARSEMultStruct *matstruct, *matstructT;
575: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
576: hipsparseIndexBase_t indexBase;
578: PetscFunctionBegin;
579: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
580: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
581: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
582: matstructT = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
583: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
584: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
585: PetscCall(PetscLogEventBegin(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
586: PetscCall(PetscLogGpuTimeBegin());
587: if (hipsparsestruct->format != MAT_HIPSPARSE_CSR) PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
588: if (!hipsparsestruct->matTranspose) { /* create hipsparse matrix */
589: matstructT = new Mat_SeqAIJHIPSPARSEMultStruct;
590: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstructT->descr));
591: indexBase = hipsparseGetMatIndexBase(matstruct->descr);
592: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstructT->descr, indexBase));
593: PetscCallHIPSPARSE(hipsparseSetMatType(matstructT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
595: /* set alpha and beta */
596: PetscCallHIP(hipMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
597: PetscCallHIP(hipMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
598: PetscCallHIP(hipMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
599: PetscCallHIP(hipMemcpy(matstructT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
600: PetscCallHIP(hipMemcpy(matstructT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
601: PetscCallHIP(hipMemcpy(matstructT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
603: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
604: CsrMatrix *matrixT = new CsrMatrix;
605: matstructT->mat = matrixT;
606: matrixT->num_rows = A->cmap->n;
607: matrixT->num_cols = A->rmap->n;
608: matrixT->num_entries = a->nz;
609: matrixT->row_offsets = new THRUSTINTARRAY(matrixT->num_rows + 1);
610: matrixT->column_indices = new THRUSTINTARRAY(a->nz);
611: matrixT->values = new THRUSTARRAY(a->nz);
613: if (!hipsparsestruct->rowoffsets_gpu) hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
614: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
615: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), csrRowOffsetsType, csrColIndType, indexBase, hipsparse_scalartype));
616: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
617: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_HIPSPARSE_ELL and MAT_HIPSPARSE_HYB are not supported");
618: }
619: }
620: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* transpose mat struct may be already present, update data */
621: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
622: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
623: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
624: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
625: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
626: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
627: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
628: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
629: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
630: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
631: if (!hipsparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
632: hipsparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
633: hipsparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
634: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
635: }
636: if (!hipsparsestruct->csr2csc_i) { // not using hipsparseCsr2cscEx2() because it requires 32-bit indices
637: THRUSTINTARRAY row_indices(matrix->num_entries);
639: // Transpose the matrix via COO, i.e., by putting the row indices in column_indices[] and the column indices in row_indices[]
640: hipsparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); // will store the matrix to matrixT permutation, i.e., entry matrixT[i] is matrix[csr2csc_i[i]]
641: PetscCallThrust(thrust::sequence(thrust::device, hipsparsestruct->csr2csc_i->begin(), hipsparsestruct->csr2csc_i->end()));
642: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->rmap->n), Csr2coo(hipsparsestruct->rowoffsets_gpu->data().get(), matrixT->column_indices->data().get())));
643: row_indices = *matrix->column_indices;
644: // Sort the COO by row then column, and get the permutation csr2csc_i[]
645: PetscCallThrust(thrust::sort_by_key(thrust::device, thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), matrixT->column_indices->begin())), thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(), matrixT->column_indices->end())),
646: hipsparsestruct->csr2csc_i->begin()));
647: // Finalize matrixT's row_offsets by looking up row_indices[]
648: PetscCallThrust(thrust::lower_bound(thrust::device, row_indices.begin(), row_indices.end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->cmap->n + 1), matrixT->row_offsets->begin()));
649: }
650: PetscCallThrust(thrust::gather(thrust::device, hipsparsestruct->csr2csc_i->begin(), hipsparsestruct->csr2csc_i->end(), matrix->values->begin(), matrixT->values->begin()));
651: }
652: PetscCall(PetscLogGpuTimeEnd());
653: PetscCall(PetscLogEventEnd(MAT_HIPSPARSEGenerateTranspose, A, 0, 0, 0));
654: /* the compressed row indices is not used for matTranspose */
655: matstructT->cprowIndices = NULL;
656: /* assign the pointer */
657: ((Mat_SeqAIJHIPSPARSE *)A->spptr)->matTranspose = matstructT;
658: A->transupdated = PETSC_TRUE;
659: PetscFunctionReturn(PETSC_SUCCESS);
660: }
662: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_LU(Mat A, Vec b, Vec x)
663: {
664: const PetscScalar *barray;
665: PetscScalar *xarray;
666: thrust::device_ptr<const PetscScalar> bGPU;
667: thrust::device_ptr<PetscScalar> xGPU;
668: Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
669: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
670: const hipsparseOperation_t op = HIPSPARSE_OPERATION_NON_TRANSPOSE;
671: const hipsparseSpSVAlg_t alg = HIPSPARSE_SPSV_ALG_DEFAULT;
672: PetscInt m = A->rmap->n;
674: PetscFunctionBegin;
675: PetscCall(PetscLogGpuTimeBegin());
676: PetscCall(VecHIPGetArrayWrite(x, &xarray));
677: PetscCall(VecHIPGetArrayRead(b, &barray));
678: xGPU = thrust::device_pointer_cast(xarray);
679: bGPU = thrust::device_pointer_cast(barray);
681: // Reorder b with the row permutation if needed, and wrap the result in fs->X
682: if (fs->rpermIndices)
683: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
685: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));
687: // Solve L Y = X
688: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
689: // Note that hipsparseSpSV_solve() secretly uses the external buffer used in hipsparseSpSV_analysis()!
691: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
692: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_L));
693: #else
694: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_L, fs->spsvBuffer_L));
695: #endif
697: // Solve U X = Y
698: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray));
700: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
701: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U));
702: #else
703: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, op, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_U, fs->spsvBuffer_U));
704: #endif
706: // Reorder X with the column permutation if needed, and put the result back to x
707: if (fs->cpermIndices)
708: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
709: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
710: PetscCall(VecHIPRestoreArrayRead(b, &barray));
711: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
712: PetscCall(PetscLogGpuTimeEnd());
713: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
714: PetscFunctionReturn(PETSC_SUCCESS);
715: }
717: static PetscErrorCode MatSolveTranspose_SeqAIJHIPSPARSE_LU(Mat A, Vec b, Vec x)
718: {
719: Mat_SeqAIJHIPSPARSETriFactors *fs = static_cast<Mat_SeqAIJHIPSPARSETriFactors *>(A->spptr);
720: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
721: const PetscScalar *barray;
722: PetscScalar *xarray;
723: thrust::device_ptr<const PetscScalar> bGPU;
724: thrust::device_ptr<PetscScalar> xGPU;
725: const hipsparseOperation_t opA = HIPSPARSE_OPERATION_TRANSPOSE;
726: const hipsparseSpSVAlg_t alg = HIPSPARSE_SPSV_ALG_DEFAULT;
727: PetscInt m = A->rmap->n;
729: PetscFunctionBegin;
730: PetscCall(PetscLogGpuTimeBegin());
731: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
732: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
733: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
734: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
736: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Ut));
737: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
738: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
739: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
740: fs->createdTransposeSpSVDescr = PETSC_TRUE;
741: }
743: if (!fs->updatedTransposeSpSVAnalysis) {
744: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
746: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
747: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
748: }
750: PetscCall(VecHIPGetArrayWrite(x, &xarray));
751: PetscCall(VecHIPGetArrayRead(b, &barray));
752: xGPU = thrust::device_pointer_cast(xarray);
753: bGPU = thrust::device_pointer_cast(barray);
755: // Reorder b with the row permutation if needed, and wrap the result in fs->X
756: if (fs->rpermIndices)
757: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
759: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->rpermIndices ? fs->X : (void *)barray));
761: // Solve Ut Y = X
762: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
763: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
764: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut));
765: #else
766: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
767: #endif
769: // Solve Lt X = Y
770: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, fs->cpermIndices ? fs->X : xarray)); // if need to permute, we need to use the intermediate buffer X
772: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
773: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_Lt));
774: #else
775: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, opA, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
776: #endif
778: // Reorder X with the column permutation if needed, and put the result back to x
779: if (fs->cpermIndices)
780: PetscCallThrust(thrust::copy(thrust::hip::par.on(PetscDefaultHipStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
781: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
783: PetscCall(VecHIPRestoreArrayRead(b, &barray));
784: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
785: PetscCall(PetscLogGpuTimeEnd());
786: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
787: PetscFunctionReturn(PETSC_SUCCESS);
788: }
790: static PetscErrorCode MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
791: {
792: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
793: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
794: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
795: CsrMatrix *Acsr;
796: PetscInt m, nz;
797: PetscBool flg;
799: PetscFunctionBegin;
800: if (PetscDefined(USE_DEBUG)) {
801: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
802: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
803: }
805: /* Copy A's value to fact */
806: m = fact->rmap->n;
807: nz = aij->nz;
808: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
809: Acsr = (CsrMatrix *)Acusp->mat->mat;
810: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
812: PetscCall(PetscLogGpuTimeBegin());
813: /* Factorize fact inplace */
814: if (m)
815: PetscCallHIPSPARSE(hipsparseXcsrilu02(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
816: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
817: if (PetscDefined(USE_DEBUG)) {
818: int numerical_zero;
819: hipsparseStatus_t status;
820: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
821: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
822: }
824: {
825: /* hipsparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after hipsparseXcsrilu02()
826: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
827: */
828: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
830: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
832: fs->updatedSpSVAnalysis = PETSC_TRUE;
833: /* L, U values have changed, reset the flag to indicate we need to redo hipsparseSpSV_analysis() for transpose solve */
834: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
835: }
837: fact->offloadmask = PETSC_OFFLOAD_GPU;
838: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but hipsparseSpSV_solve() supports both 32 and 64. The info is encoded in hipsparseSpMatDescr_t.
839: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_LU;
840: fact->ops->matsolve = NULL;
841: fact->ops->matsolvetranspose = NULL;
842: PetscCall(PetscLogGpuTimeEnd());
843: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
844: PetscFunctionReturn(PETSC_SUCCESS);
845: }
847: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
848: {
849: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
850: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
851: PetscInt m, nz;
853: PetscFunctionBegin;
854: if (PetscDefined(USE_DEBUG)) {
855: PetscBool flg, diagDense;
857: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
858: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
859: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
860: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
861: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
862: }
864: /* Free the old stale stuff */
865: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
867: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
868: but they will not be used. Allocate them just for easy debugging.
869: */
870: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
872: fact->offloadmask = PETSC_OFFLOAD_BOTH;
873: fact->factortype = MAT_FACTOR_ILU;
874: fact->info.factor_mallocs = 0;
875: fact->info.fill_ratio_given = info->fill;
876: fact->info.fill_ratio_needed = 1.0;
878: aij->row = NULL;
879: aij->col = NULL;
881: /* ====================================================================== */
882: /* Copy A's i, j to fact and also allocate the value array of fact. */
883: /* We'll do in-place factorization on fact */
884: /* ====================================================================== */
885: const PetscInt *Ai, *Aj;
887: m = fact->rmap->n;
888: nz = aij->nz;
890: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
891: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
892: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
893: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed
895: PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
896: PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
897: PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));
899: /* ====================================================================== */
900: /* Create descriptors for M, L, U */
901: /* ====================================================================== */
902: hipsparseFillMode_t fillMode;
903: hipsparseDiagType_t diagType;
905: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
906: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
907: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
909: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
910: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
911: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
912: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
913: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
914: */
915: fillMode = HIPSPARSE_FILL_MODE_LOWER;
916: diagType = HIPSPARSE_DIAG_TYPE_UNIT;
917: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
918: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
919: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
921: fillMode = HIPSPARSE_FILL_MODE_UPPER;
922: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
923: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
924: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
925: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_U, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
927: /* ========================================================================= */
928: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
929: /* ========================================================================= */
930: PetscCallHIPSPARSE(hipsparseCreateCsrilu02Info(&fs->ilu0Info_M));
931: if (m)
932: PetscCallHIPSPARSE(hipsparseXcsrilu02_bufferSize(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
933: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
935: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
936: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
938: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
939: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
941: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
942: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
944: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_U));
945: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
947: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
948: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
949: spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
950: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
951: */
952: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
953: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
954: fs->spsvBuffer_L = fs->factBuffer_M;
955: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
956: } else {
957: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
958: fs->spsvBuffer_U = fs->factBuffer_M;
959: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
960: }
962: /* ========================================================================== */
963: /* Perform analysis of ilu0 on M, SpSv on L and U */
964: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
965: /* ========================================================================== */
966: int structural_zero;
967: hipsparseStatus_t status;
969: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
970: if (m)
971: PetscCallHIPSPARSE(hipsparseXcsrilu02_analysis(fs->handle, m, nz, /* hipsparseXcsrilu02 errors out with empty matrices (m=0) */
972: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
973: if (PetscDefined(USE_DEBUG)) {
974: /* hipsparseXcsrilu02_zeroPivot() is a blocking call. It calls hipDeviceSynchronize() to make sure all previous kernels are done. */
975: status = hipsparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
976: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
977: }
979: /* Estimate FLOPs of the numeric factorization */
980: {
981: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
982: PetscInt *Ai, nzRow, nzLeft;
983: const PetscInt *adiag;
984: PetscLogDouble flops = 0.0;
986: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
987: Ai = Aseq->i;
988: for (PetscInt i = 0; i < m; i++) {
989: if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
990: nzRow = Ai[i + 1] - Ai[i];
991: nzLeft = adiag[i] - Ai[i];
992: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
993: and include the eliminated one will be updated, which incurs a multiplication and an addition.
994: */
995: nzLeft = (nzRow - 1) / 2;
996: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
997: }
998: }
999: fs->numericFactFlops = flops;
1000: }
1001: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJHIPSPARSE_ILU0;
1002: PetscFunctionReturn(PETSC_SUCCESS);
1003: }
1005: static PetscErrorCode MatSolve_SeqAIJHIPSPARSE_ICC0(Mat fact, Vec b, Vec x)
1006: {
1007: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1008: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1009: const PetscScalar *barray;
1010: PetscScalar *xarray;
1012: PetscFunctionBegin;
1013: PetscCall(VecHIPGetArrayWrite(x, &xarray));
1014: PetscCall(VecHIPGetArrayRead(b, &barray));
1015: PetscCall(PetscLogGpuTimeBegin());
1017: /* Solve L*y = b */
1018: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1019: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1020: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1021: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1022: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1023: #else
1024: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1025: fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1026: #endif
1027: /* Solve Lt*x = y */
1028: PetscCallHIPSPARSE(hipsparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1029: #if PETSC_PKG_HIP_VERSION_EQ(5, 6, 0) || PETSC_PKG_HIP_VERSION_GE(6, 0, 0)
1030: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1031: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1032: #else
1033: PetscCallHIPSPARSE(hipsparseSpSV_solve(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1034: fs->dnVecDescr_Y, fs->dnVecDescr_X, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1035: #endif
1036: PetscCall(VecHIPRestoreArrayRead(b, &barray));
1037: PetscCall(VecHIPRestoreArrayWrite(x, &xarray));
1039: PetscCall(PetscLogGpuTimeEnd());
1040: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1041: PetscFunctionReturn(PETSC_SUCCESS);
1042: }
1044: static PetscErrorCode MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1045: {
1046: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1047: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1048: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1049: CsrMatrix *Acsr;
1050: PetscInt m, nz;
1051: PetscBool flg;
1053: PetscFunctionBegin;
1054: if (PetscDefined(USE_DEBUG)) {
1055: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1056: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1057: }
1059: /* Copy A's value to fact */
1060: m = fact->rmap->n;
1061: nz = aij->nz;
1062: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1063: Acsr = (CsrMatrix *)Acusp->mat->mat;
1064: PetscCallHIP(hipMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, hipMemcpyDeviceToDevice, PetscDefaultHipStream));
1066: /* Factorize fact inplace */
1067: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1068: csric02() only takes the lower triangular part of matrix A to perform factorization.
1069: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1070: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1071: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1072: */
1073: if (m) PetscCallHIPSPARSE(hipsparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1074: if (PetscDefined(USE_DEBUG)) {
1075: int numerical_zero;
1076: hipsparseStatus_t status;
1077: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1078: PetscAssert(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1079: }
1081: {
1082: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1084: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1085: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1086: */
1087: PetscCallHIPSPARSE(hipsparseSpSV_analysis(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1088: fs->updatedSpSVAnalysis = PETSC_TRUE;
1089: }
1091: fact->offloadmask = PETSC_OFFLOAD_GPU;
1092: fact->ops->solve = MatSolve_SeqAIJHIPSPARSE_ICC0;
1093: fact->ops->solvetranspose = MatSolve_SeqAIJHIPSPARSE_ICC0;
1094: fact->ops->matsolve = NULL;
1095: fact->ops->matsolvetranspose = NULL;
1096: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1097: PetscFunctionReturn(PETSC_SUCCESS);
1098: }
1100: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1101: {
1102: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)fact->spptr;
1103: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1104: PetscInt m, nz;
1106: PetscFunctionBegin;
1107: if (PetscDefined(USE_DEBUG)) {
1108: PetscBool flg, diagDense;
1110: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1111: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJHIPSPARSE, but input is %s", ((PetscObject)A)->type_name);
1112: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1113: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1114: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1115: }
1117: /* Free the old stale stuff */
1118: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&fs));
1120: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1121: but they will not be used. Allocate them just for easy debugging.
1122: */
1123: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1125: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1126: fact->factortype = MAT_FACTOR_ICC;
1127: fact->info.factor_mallocs = 0;
1128: fact->info.fill_ratio_given = info->fill;
1129: fact->info.fill_ratio_needed = 1.0;
1131: aij->row = NULL;
1132: aij->col = NULL;
1134: /* ====================================================================== */
1135: /* Copy A's i, j to fact and also allocate the value array of fact. */
1136: /* We'll do in-place factorization on fact */
1137: /* ====================================================================== */
1138: const PetscInt *Ai, *Aj;
1140: m = fact->rmap->n;
1141: nz = aij->nz;
1143: PetscCallHIP(hipMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1144: PetscCallHIP(hipMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1145: PetscCallHIP(hipMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1146: PetscCall(MatSeqAIJHIPSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed
1148: PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
1149: PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
1150: PetscCallThrust(thrust::transform(thrust::hip::par.on(PetscDefaultHipStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));
1152: /* ====================================================================== */
1153: /* Create mat descriptors for M, L */
1154: /* ====================================================================== */
1155: hipsparseFillMode_t fillMode;
1156: hipsparseDiagType_t diagType;
1158: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&fs->matDescr_M));
1159: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(fs->matDescr_M, HIPSPARSE_INDEX_BASE_ZERO));
1160: PetscCallHIPSPARSE(hipsparseSetMatType(fs->matDescr_M, HIPSPARSE_MATRIX_TYPE_GENERAL));
1162: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1163: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1164: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1165: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1166: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1167: */
1168: fillMode = HIPSPARSE_FILL_MODE_LOWER;
1169: diagType = HIPSPARSE_DIAG_TYPE_NON_UNIT;
1170: PetscCallHIPSPARSE(hipsparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_32I, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1171: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1172: PetscCallHIPSPARSE(hipsparseSpMatSetAttribute(fs->spMatDescr_L, HIPSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1174: /* ========================================================================= */
1175: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1176: /* ========================================================================= */
1177: PetscCallHIPSPARSE(hipsparseCreateCsric02Info(&fs->ic0Info_M));
1178: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
1180: PetscCallHIP(hipMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1181: PetscCallHIP(hipMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1183: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, hipsparse_scalartype));
1184: PetscCallHIPSPARSE(hipsparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, hipsparse_scalartype));
1186: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_L));
1187: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1189: PetscCallHIPSPARSE(hipsparseSpSV_createDescr(&fs->spsvDescr_Lt));
1190: PetscCallHIPSPARSE(hipsparseSpSV_bufferSize(fs->handle, HIPSPARSE_OPERATION_TRANSPOSE, &PETSC_HIPSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, hipsparse_scalartype, HIPSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1192: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1193: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1194: */
1195: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1196: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1197: fs->spsvBuffer_L = fs->factBuffer_M;
1198: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1199: } else {
1200: PetscCallHIP(hipMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1201: fs->spsvBuffer_Lt = fs->factBuffer_M;
1202: PetscCallHIP(hipMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1203: }
1205: /* ========================================================================== */
1206: /* Perform analysis of ic0 on M */
1207: /* The lower triangular part of M has the same sparsity pattern as L */
1208: /* ========================================================================== */
1209: int structural_zero;
1210: hipsparseStatus_t status;
1212: fs->policy_M = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
1213: if (m) PetscCallHIPSPARSE(hipsparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1214: if (PetscDefined(USE_DEBUG)) {
1215: /* hipsparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1216: status = hipsparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1217: PetscCheck(HIPSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1218: }
1220: /* Estimate FLOPs of the numeric factorization */
1221: {
1222: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1223: PetscInt *Ai, nzRow, nzLeft;
1224: PetscLogDouble flops = 0.0;
1226: Ai = Aseq->i;
1227: for (PetscInt i = 0; i < m; i++) {
1228: nzRow = Ai[i + 1] - Ai[i];
1229: if (nzRow > 1) {
1230: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1231: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1232: */
1233: nzLeft = (nzRow - 1) / 2;
1234: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1235: }
1236: }
1237: fs->numericFactFlops = flops;
1238: }
1239: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJHIPSPARSE_ICC0;
1240: PetscFunctionReturn(PETSC_SUCCESS);
1241: }
1243: static PetscErrorCode MatLUFactorNumeric_SeqAIJHIPSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1244: {
1245: // use_cpu_solve is a field in Mat_SeqAIJHIPSPARSE. B, a factored matrix, uses Mat_SeqAIJHIPSPARSETriFactors.
1246: Mat_SeqAIJHIPSPARSE *hipsparsestruct = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1248: PetscFunctionBegin;
1249: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
1250: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1251: B->offloadmask = PETSC_OFFLOAD_CPU;
1253: if (!hipsparsestruct->use_cpu_solve) {
1254: B->ops->solve = MatSolve_SeqAIJHIPSPARSE_LU;
1255: B->ops->solvetranspose = MatSolveTranspose_SeqAIJHIPSPARSE_LU;
1256: }
1257: B->ops->matsolve = NULL;
1258: B->ops->matsolvetranspose = NULL;
1260: /* get the triangular factors */
1261: if (!hipsparsestruct->use_cpu_solve) PetscCall(MatSeqAIJHIPSPARSEILUAnalysisAndCopyToGPU(B));
1262: PetscFunctionReturn(PETSC_SUCCESS);
1263: }
1265: static PetscErrorCode MatLUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1266: {
1267: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1269: PetscFunctionBegin;
1270: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1271: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1272: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1273: PetscFunctionReturn(PETSC_SUCCESS);
1274: }
1276: static PetscErrorCode MatILUFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1277: {
1278: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1280: PetscFunctionBegin;
1281: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1282: if (!info->factoronhost) {
1283: PetscCall(ISIdentity(isrow, &row_identity));
1284: PetscCall(ISIdentity(iscol, &col_identity));
1285: }
1286: if (!info->levels && row_identity && col_identity) PetscCall(MatILUFactorSymbolic_SeqAIJHIPSPARSE_ILU0(B, A, isrow, iscol, info));
1287: else {
1288: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1289: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1290: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJHIPSPARSE;
1291: }
1292: PetscFunctionReturn(PETSC_SUCCESS);
1293: }
1295: static PetscErrorCode MatICCFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1296: {
1297: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1299: PetscFunctionBegin;
1300: PetscBool perm_identity = PETSC_FALSE;
1301: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1302: if (!info->levels && perm_identity) PetscCall(MatICCFactorSymbolic_SeqAIJHIPSPARSE_ICC0(B, A, perm, info));
1303: else {
1304: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1305: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1306: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1307: }
1308: PetscFunctionReturn(PETSC_SUCCESS);
1309: }
1311: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1312: {
1313: Mat_SeqAIJHIPSPARSETriFactors *hipsparseTriFactors = (Mat_SeqAIJHIPSPARSETriFactors *)B->spptr;
1315: PetscFunctionBegin;
1316: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(&hipsparseTriFactors));
1317: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1318: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJHIPSPARSE;
1319: PetscFunctionReturn(PETSC_SUCCESS);
1320: }
1322: static PetscErrorCode MatFactorGetSolverType_seqaij_hipsparse(Mat A, MatSolverType *type)
1323: {
1324: PetscFunctionBegin;
1325: *type = MATSOLVERHIPSPARSE;
1326: PetscFunctionReturn(PETSC_SUCCESS);
1327: }
1329: /*MC
1330: MATSOLVERHIPSPARSE = "hipsparse" - A matrix type providing triangular solvers for sequential matrices
1331: on a single GPU of type, `MATSEQAIJHIPSPARSE`. Currently supported
1332: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1333: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1334: HipSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1335: algorithms are not recommended. This class does NOT support direct solver operations.
1337: Level: beginner
1339: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJHIPSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
1340: M*/
1342: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijhipsparse_hipsparse(Mat A, MatFactorType ftype, Mat *B)
1343: {
1344: PetscInt n = A->rmap->n;
1346: PetscFunctionBegin;
1347: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1348: PetscCall(MatSetSizes(*B, n, n, n, n));
1349: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJHIPSPARSETriFactors
1350: PetscCall(MatSetType(*B, MATSEQAIJHIPSPARSE));
1352: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1353: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1354: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1355: if (!A->boundtocpu) {
1356: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJHIPSPARSE;
1357: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJHIPSPARSE;
1358: } else {
1359: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1360: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1361: }
1362: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1363: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1364: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1365: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1366: if (!A->boundtocpu) {
1367: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJHIPSPARSE;
1368: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJHIPSPARSE;
1369: } else {
1370: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1371: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1372: }
1373: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1374: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1375: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for HIPSPARSE Matrix Types");
1377: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1378: (*B)->canuseordering = PETSC_TRUE;
1379: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_hipsparse));
1380: PetscFunctionReturn(PETSC_SUCCESS);
1381: }
1383: static PetscErrorCode MatSeqAIJHIPSPARSECopyFromGPU(Mat A)
1384: {
1385: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1386: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1387: Mat_SeqAIJHIPSPARSETriFactors *fs = (Mat_SeqAIJHIPSPARSETriFactors *)A->spptr;
1389: PetscFunctionBegin;
1390: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1391: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1392: if (A->factortype == MAT_FACTOR_NONE) {
1393: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1394: PetscCallHIP(hipMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1395: } else if (fs->csrVal) {
1396: /* We have a factorized matrix on device and are able to copy it to host */
1397: PetscCallHIP(hipMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), hipMemcpyDeviceToHost));
1398: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1399: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1400: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyFromGPU, A, 0, 0, 0));
1401: A->offloadmask = PETSC_OFFLOAD_BOTH;
1402: }
1403: PetscFunctionReturn(PETSC_SUCCESS);
1404: }
1406: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (HIP specialisation) */
1407: struct MatSeqAIJHIPSPARSE_Policy {
1408: typedef Mat_SeqAIJHIPSPARSE mat_struct_type;
1409: typedef Mat_SeqAIJHIPSPARSEMultStruct mult_struct_type;
1411: static int storage_format_csr() { return (int)MAT_HIPSPARSE_CSR; }
1412: static int storage_format_ell() { return (int)MAT_HIPSPARSE_ELL; }
1413: static int storage_format_hyb() { return (int)MAT_HIPSPARSE_HYB; }
1415: static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJHIPSPARSECopyToGPU(A); }
1416: static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJHIPSPARSECopyFromGPU(A); }
1417: static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJHIPSPARSEInvalidateTranspose(A, d); }
1418: static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, t, r, C); }
1419: static const char *mat_type_name;
1421: static PetscErrorCode Destroy(Mat A) { return MatSeqAIJHIPSPARSE_Destroy(A); }
1422: static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJHIPSPARSETriFactors_Destroy((Mat_SeqAIJHIPSPARSETriFactors **)spptr); }
1423: static const char *set_format_c;
1424: static const char *set_use_cpu_solve_c;
1425: static const char *product_seqdense_device_c;
1426: static const char *product_seqdense_c;
1427: static const char *product_self_c;
1428: static const char *seq_convert_hypre_c;
1430: static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecHIPGetArrayRead(v, a); }
1431: static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecHIPRestoreArrayRead(v, a); }
1432: static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecHIPGetArrayWrite(v, a); }
1433: static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecHIPRestoreArrayWrite(v, a); }
1434: };
1435: const char *MatSeqAIJHIPSPARSE_Policy::mat_type_name = MATSEQAIJHIPSPARSE;
1436: const char *MatSeqAIJHIPSPARSE_Policy::set_format_c = "MatHIPSPARSESetFormat_C";
1437: const char *MatSeqAIJHIPSPARSE_Policy::set_use_cpu_solve_c = "MatHIPSPARSESetUseCPUSolve_C";
1438: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C";
1439: const char *MatSeqAIJHIPSPARSE_Policy::product_seqdense_c = "MatProductSetFromOptions_seqaijhipsparse_seqdense_C";
1440: const char *MatSeqAIJHIPSPARSE_Policy::product_self_c = "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C";
1441: const char *MatSeqAIJHIPSPARSE_Policy::seq_convert_hypre_c = "MatConvert_seqaijhipsparse_hypre_C";
1443: using MatSeqAIJHIPSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::HIP, MatSeqAIJHIPSPARSE_Policy>;
1445: static PetscErrorCode MatSeqAIJGetArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1446: {
1447: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1448: }
1450: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1451: {
1452: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1453: }
1455: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1456: {
1457: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1458: }
1460: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE(Mat A, const PetscScalar *array[])
1461: {
1462: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1463: }
1465: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1466: {
1467: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1468: }
1470: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE(Mat A, PetscScalar *array[])
1471: {
1472: return MatSeqAIJHIPSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1473: }
1475: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1476: {
1477: Mat_SeqAIJHIPSPARSE *cusp;
1478: CsrMatrix *matrix;
1480: PetscFunctionBegin;
1481: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1482: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1483: cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(A->spptr);
1484: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1485: matrix = (CsrMatrix *)cusp->mat->mat;
1487: if (i) *i = matrix->row_offsets->data().get();
1488: if (j) *j = matrix->column_indices->data().get();
1489: if (a) *a = matrix->values->data().get();
1490: if (mtype) *mtype = PETSC_MEMTYPE_HIP;
1491: PetscFunctionReturn(PETSC_SUCCESS);
1492: }
1494: PETSC_INTERN PetscErrorCode MatSeqAIJHIPSPARSECopyToGPU(Mat A)
1495: {
1496: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1497: Mat_SeqAIJHIPSPARSEMultStruct *matstruct = hipsparsestruct->mat;
1498: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1499: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
1500: PetscBool both = PETSC_TRUE;
1502: PetscFunctionBegin;
1503: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1504: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1505: if (A->nonzerostate == hipsparsestruct->nonzerostate && hipsparsestruct->format == MAT_HIPSPARSE_CSR) { /* Copy values only */
1506: CsrMatrix *matrix;
1507: matrix = (CsrMatrix *)hipsparsestruct->mat->mat;
1509: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1510: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1511: matrix->values->assign(a->a, a->a + a->nz);
1512: PetscCallHIP(WaitForHIP());
1513: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1514: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1515: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_FALSE));
1516: } else {
1517: PetscInt nnz;
1518: PetscCall(PetscLogEventBegin(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1519: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&hipsparsestruct->mat, hipsparsestruct->format));
1520: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(A, PETSC_TRUE));
1521: delete hipsparsestruct->workVector;
1522: delete hipsparsestruct->rowoffsets_gpu;
1523: hipsparsestruct->workVector = NULL;
1524: hipsparsestruct->rowoffsets_gpu = NULL;
1525: try {
1526: if (a->compressedrow.use) {
1527: m = a->compressedrow.nrows;
1528: ii = a->compressedrow.i;
1529: ridx = a->compressedrow.rindex;
1530: } else {
1531: m = A->rmap->n;
1532: ii = a->i;
1533: ridx = NULL;
1534: }
1535: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1536: if (!a->a) {
1537: nnz = ii[m];
1538: both = PETSC_FALSE;
1539: } else nnz = a->nz;
1540: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
1542: /* create cusparse matrix */
1543: hipsparsestruct->nrows = m;
1544: matstruct = new Mat_SeqAIJHIPSPARSEMultStruct;
1545: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&matstruct->descr));
1546: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(matstruct->descr, HIPSPARSE_INDEX_BASE_ZERO));
1547: PetscCallHIPSPARSE(hipsparseSetMatType(matstruct->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
1549: PetscCallHIP(hipMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1550: PetscCallHIP(hipMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1551: PetscCallHIP(hipMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1552: PetscCallHIP(hipMemcpy(matstruct->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
1553: PetscCallHIP(hipMemcpy(matstruct->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
1554: PetscCallHIP(hipMemcpy(matstruct->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
1555: PetscCallHIPSPARSE(hipsparseSetPointerMode(hipsparsestruct->handle, HIPSPARSE_POINTER_MODE_DEVICE));
1557: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1558: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
1559: /* set the matrix */
1560: CsrMatrix *mat = new CsrMatrix;
1561: mat->num_rows = m;
1562: mat->num_cols = A->cmap->n;
1563: mat->num_entries = nnz;
1564: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY(m + 1));
1565: mat->row_offsets->assign(ii, ii + m + 1);
1566: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY(nnz));
1567: mat->column_indices->assign(a->j, a->j + nnz);
1569: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1570: if (a->a) mat->values->assign(a->a, a->a + nnz);
1572: /* assign the pointer */
1573: matstruct->mat = mat;
1574: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1575: PetscCallHIPSPARSE(hipsparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1576: }
1577: } else if (hipsparsestruct->format == MAT_HIPSPARSE_ELL || hipsparsestruct->format == MAT_HIPSPARSE_HYB) {
1578: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_HIPSPARSE_ELL and MAT_HIPSPARSE_HYB are not supported");
1579: }
1581: /* assign the compressed row indices */
1582: if (a->compressedrow.use) {
1583: PetscCallCXX(hipsparsestruct->workVector = new THRUSTARRAY(m));
1584: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1585: matstruct->cprowIndices->assign(ridx, ridx + m);
1586: tmp = m;
1587: } else {
1588: hipsparsestruct->workVector = NULL;
1589: matstruct->cprowIndices = NULL;
1590: tmp = 0;
1591: }
1592: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
1594: /* assign the pointer */
1595: hipsparsestruct->mat = matstruct;
1596: } catch (char *ex) {
1597: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
1598: }
1599: PetscCallHIP(WaitForHIP());
1600: PetscCall(PetscLogEventEnd(MAT_HIPSPARSECopyToGPU, A, 0, 0, 0));
1601: hipsparsestruct->nonzerostate = A->nonzerostate;
1602: }
1603: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1604: }
1605: PetscFunctionReturn(PETSC_SUCCESS);
1606: }
1608: struct VecHIPPlusEquals {
1609: template <typename Tuple>
1610: __host__ __device__ void operator()(Tuple t)
1611: {
1612: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1613: }
1614: };
1616: struct VecHIPEquals {
1617: template <typename Tuple>
1618: __host__ __device__ void operator()(Tuple t)
1619: {
1620: thrust::get<1>(t) = thrust::get<0>(t);
1621: }
1622: };
1624: struct VecHIPEqualsReverse {
1625: template <typename Tuple>
1626: __host__ __device__ void operator()(Tuple t)
1627: {
1628: thrust::get<0>(t) = thrust::get<1>(t);
1629: }
1630: };
1632: struct MatProductCtx_MatMatHipsparse {
1633: PetscBool cisdense;
1634: PetscScalar *Bt;
1635: Mat X;
1636: PetscBool reusesym; /* Hipsparse does not have split symbolic and numeric phases for sparse matmat operations */
1637: PetscLogDouble flops;
1638: CsrMatrix *Bcsr;
1639: hipsparseSpMatDescr_t matSpBDescr;
1640: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1641: hipsparseDnMatDescr_t matBDescr;
1642: hipsparseDnMatDescr_t matCDescr;
1643: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1644: size_t mmBufferSize;
1645: void *mmBuffer, *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1646: hipsparseSpGEMMDescr_t spgemmDesc;
1647: };
1649: static PetscErrorCode MatProductCtxDestroy_MatMatHipsparse(PetscCtxRt data)
1650: {
1651: MatProductCtx_MatMatHipsparse *mmdata = *(MatProductCtx_MatMatHipsparse **)data;
1653: PetscFunctionBegin;
1654: PetscCallHIP(hipFree(mmdata->Bt));
1655: delete mmdata->Bcsr;
1656: if (mmdata->matSpBDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mmdata->matSpBDescr));
1657: if (mmdata->matBDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
1658: if (mmdata->matCDescr) PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
1659: if (mmdata->spgemmDesc) PetscCallHIPSPARSE(hipsparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1660: PetscCallHIP(hipFree(mmdata->mmBuffer));
1661: PetscCallHIP(hipFree(mmdata->mmBuffer2));
1662: PetscCall(MatDestroy(&mmdata->X));
1663: PetscCall(PetscFree(*(void **)data));
1664: PetscFunctionReturn(PETSC_SUCCESS);
1665: }
1667: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
1668: {
1669: Mat_Product *product = C->product;
1670: Mat A, B;
1671: PetscInt m, n, blda, clda;
1672: PetscBool flg, biship;
1673: Mat_SeqAIJHIPSPARSE *cusp;
1674: hipsparseOperation_t opA;
1675: const PetscScalar *barray;
1676: PetscScalar *carray;
1677: MatProductCtx_MatMatHipsparse *mmdata;
1678: Mat_SeqAIJHIPSPARSEMultStruct *mat;
1679: CsrMatrix *csrmat;
1681: PetscFunctionBegin;
1682: MatCheckProduct(C, 1);
1683: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1684: mmdata = (MatProductCtx_MatMatHipsparse *)product->data;
1685: A = product->A;
1686: B = product->B;
1687: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1688: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1689: /* currently CopyToGpu does not copy if the matrix is bound to CPU
1690: Instead of silently accepting the wrong answer, I prefer to raise the error */
1691: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1692: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1693: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1694: switch (product->type) {
1695: case MATPRODUCT_AB:
1696: case MATPRODUCT_PtAP:
1697: mat = cusp->mat;
1698: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1699: m = A->rmap->n;
1700: n = B->cmap->n;
1701: break;
1702: case MATPRODUCT_AtB:
1703: if (!A->form_explicit_transpose) {
1704: mat = cusp->mat;
1705: opA = HIPSPARSE_OPERATION_TRANSPOSE;
1706: } else {
1707: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
1708: mat = cusp->matTranspose;
1709: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1710: }
1711: m = A->cmap->n;
1712: n = B->cmap->n;
1713: break;
1714: case MATPRODUCT_ABt:
1715: case MATPRODUCT_RARt:
1716: mat = cusp->mat;
1717: opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
1718: m = A->rmap->n;
1719: n = B->rmap->n;
1720: break;
1721: default:
1722: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1723: }
1724: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
1725: csrmat = (CsrMatrix *)mat->mat;
1726: /* if the user passed a CPU matrix, copy the data to the GPU */
1727: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSEHIP, &biship));
1728: if (!biship) PetscCall(MatConvert(B, MATSEQDENSEHIP, MAT_INPLACE_MATRIX, &B));
1729: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
1730: PetscCall(MatDenseGetLDA(B, &blda));
1731: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1732: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1733: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1734: } else {
1735: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1736: PetscCall(MatDenseGetLDA(C, &clda));
1737: }
1739: PetscCall(PetscLogGpuTimeBegin());
1740: hipsparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE;
1741: /* (re)allocate mmBuffer if not initialized or LDAs are different */
1742: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1743: size_t mmBufferSize;
1744: if (mmdata->initialized && mmdata->Blda != blda) {
1745: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matBDescr));
1746: mmdata->matBDescr = NULL;
1747: }
1748: if (!mmdata->matBDescr) {
1749: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
1750: mmdata->Blda = blda;
1751: }
1752: if (mmdata->initialized && mmdata->Clda != clda) {
1753: PetscCallHIPSPARSE(hipsparseDestroyDnMat(mmdata->matCDescr));
1754: mmdata->matCDescr = NULL;
1755: }
1756: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1757: PetscCallHIPSPARSE(hipsparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, hipsparse_scalartype, HIPSPARSE_ORDER_COL));
1758: mmdata->Clda = clda;
1759: }
1760: if (!mat->matDescr) {
1761: PetscCallHIPSPARSE(hipsparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
1762: }
1763: PetscCallHIPSPARSE(hipsparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, &mmBufferSize));
1764: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1765: PetscCallHIP(hipFree(mmdata->mmBuffer));
1766: PetscCallHIP(hipMalloc(&mmdata->mmBuffer, mmBufferSize));
1767: mmdata->mmBufferSize = mmBufferSize;
1768: }
1769: mmdata->initialized = PETSC_TRUE;
1770: } else {
1771: /* to be safe, always update pointers of the mats */
1772: PetscCallHIPSPARSE(hipsparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
1773: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1774: PetscCallHIPSPARSE(hipsparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1775: }
1777: /* do hipsparseSpMM, which supports transpose on B */
1778: PetscCallHIPSPARSE(hipsparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, hipsparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1780: PetscCall(PetscLogGpuTimeEnd());
1781: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1782: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1783: if (product->type == MATPRODUCT_RARt) {
1784: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1785: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1786: } else if (product->type == MATPRODUCT_PtAP) {
1787: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1788: PetscCall(MatMatMultNumeric_SeqDenseHIP_SeqDenseHIP_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1789: } else PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1790: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1791: if (!biship) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1792: PetscFunctionReturn(PETSC_SUCCESS);
1793: }
1795: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP(Mat C)
1796: {
1797: Mat_Product *product = C->product;
1798: Mat A, B;
1799: PetscInt m, n;
1800: PetscBool cisdense, flg;
1801: MatProductCtx_MatMatHipsparse *mmdata;
1802: Mat_SeqAIJHIPSPARSE *cusp;
1804: PetscFunctionBegin;
1805: MatCheckProduct(C, 1);
1806: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1807: A = product->A;
1808: B = product->B;
1809: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1810: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1811: cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1812: PetscCheck(cusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1813: switch (product->type) {
1814: case MATPRODUCT_AB:
1815: m = A->rmap->n;
1816: n = B->cmap->n;
1817: break;
1818: case MATPRODUCT_AtB:
1819: m = A->cmap->n;
1820: n = B->cmap->n;
1821: break;
1822: case MATPRODUCT_ABt:
1823: m = A->rmap->n;
1824: n = B->rmap->n;
1825: break;
1826: case MATPRODUCT_PtAP:
1827: m = B->cmap->n;
1828: n = B->cmap->n;
1829: break;
1830: case MATPRODUCT_RARt:
1831: m = B->rmap->n;
1832: n = B->rmap->n;
1833: break;
1834: default:
1835: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1836: }
1837: PetscCall(MatSetSizes(C, m, n, m, n));
1838: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1839: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1840: PetscCall(MatSetType(C, MATSEQDENSEHIP));
1842: /* product data */
1843: PetscCall(PetscNew(&mmdata));
1844: mmdata->cisdense = cisdense;
1845: /* for these products we need intermediate storage */
1846: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1847: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1848: PetscCall(MatSetType(mmdata->X, MATSEQDENSEHIP));
1849: /* do not preallocate, since the first call to MatDenseHIPGetArray will preallocate on the GPU for us */
1850: if (product->type == MATPRODUCT_RARt) PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1851: else PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1852: }
1853: C->product->data = mmdata;
1854: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
1855: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqDENSEHIP;
1856: PetscFunctionReturn(PETSC_SUCCESS);
1857: }
1859: static PetscErrorCode MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
1860: {
1861: Mat_Product *product = C->product;
1862: Mat A, B;
1863: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
1864: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1865: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
1866: CsrMatrix *Acsr, *Bcsr, *Ccsr;
1867: PetscBool flg;
1868: MatProductType ptype;
1869: MatProductCtx_MatMatHipsparse *mmdata;
1870: hipsparseSpMatDescr_t BmatSpDescr;
1871: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* hipSPARSE spgemm doesn't support transpose yet */
1873: PetscFunctionBegin;
1874: MatCheckProduct(C, 1);
1875: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1876: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJHIPSPARSE, &flg));
1877: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1878: mmdata = (MatProductCtx_MatMatHipsparse *)C->product->data;
1879: A = product->A;
1880: B = product->B;
1881: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1882: mmdata->reusesym = PETSC_FALSE;
1883: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
1884: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1885: Cmat = Ccusp->mat;
1886: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1887: Ccsr = (CsrMatrix *)Cmat->mat;
1888: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1889: goto finalize;
1890: }
1891: if (!c->nz) goto finalize;
1892: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1893: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1894: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
1895: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1896: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1897: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a HIPSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1898: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
1899: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
1900: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
1901: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1902: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1903: PetscCheck(Ccusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
1904: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
1905: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
1907: ptype = product->type;
1908: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1909: ptype = MATPRODUCT_AB;
1910: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1911: }
1912: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1913: ptype = MATPRODUCT_AB;
1914: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1915: }
1916: switch (ptype) {
1917: case MATPRODUCT_AB:
1918: Amat = Acusp->mat;
1919: Bmat = Bcusp->mat;
1920: break;
1921: case MATPRODUCT_AtB:
1922: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
1923: Amat = Acusp->matTranspose;
1924: Bmat = Bcusp->mat;
1925: break;
1926: case MATPRODUCT_ABt:
1927: Amat = Acusp->mat;
1928: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
1929: Bmat = Bcusp->matTranspose;
1930: break;
1931: default:
1932: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1933: }
1934: Cmat = Ccusp->mat;
1935: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1936: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1937: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1938: Acsr = (CsrMatrix *)Amat->mat;
1939: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1940: Ccsr = (CsrMatrix *)Cmat->mat;
1941: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
1942: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
1943: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1944: PetscCall(PetscLogGpuTimeBegin());
1945: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
1946: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
1947: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
1948: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
1949: PetscCall(PetscLogGpuFlops(mmdata->flops));
1950: PetscCallHIP(WaitForHIP());
1951: PetscCall(PetscLogGpuTimeEnd());
1952: C->offloadmask = PETSC_OFFLOAD_GPU;
1953: finalize:
1954: /* shorter version of MatAssemblyEnd_SeqAIJ */
1955: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
1956: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
1957: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
1958: c->reallocs = 0;
1959: C->info.mallocs += 0;
1960: C->info.nz_unneeded = 0;
1961: C->assembled = C->was_assembled = PETSC_TRUE;
1962: C->num_ass++;
1963: PetscFunctionReturn(PETSC_SUCCESS);
1964: }
1966: static PetscErrorCode MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE(Mat C)
1967: {
1968: Mat_Product *product = C->product;
1969: Mat A, B;
1970: Mat_SeqAIJHIPSPARSE *Acusp, *Bcusp, *Ccusp;
1971: Mat_SeqAIJ *a, *b, *c;
1972: Mat_SeqAIJHIPSPARSEMultStruct *Amat, *Bmat, *Cmat;
1973: CsrMatrix *Acsr, *Bcsr, *Ccsr;
1974: PetscInt i, j, m, n, k;
1975: PetscBool flg;
1976: MatProductType ptype;
1977: MatProductCtx_MatMatHipsparse *mmdata;
1978: PetscLogDouble flops;
1979: PetscBool biscompressed, ciscompressed;
1980: int64_t C_num_rows1, C_num_cols1, C_nnz1;
1981: hipsparseSpMatDescr_t BmatSpDescr;
1982: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE, opB = HIPSPARSE_OPERATION_NON_TRANSPOSE; /* HIPSPARSE spgemm doesn't support transpose yet */
1983: size_t bufSize2;
1985: PetscFunctionBegin;
1986: MatCheckProduct(C, 1);
1987: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1988: A = product->A;
1989: B = product->B;
1990: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJHIPSPARSE, &flg));
1991: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1992: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJHIPSPARSE, &flg));
1993: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1994: a = (Mat_SeqAIJ *)A->data;
1995: b = (Mat_SeqAIJ *)B->data;
1996: /* product data */
1997: PetscCall(PetscNew(&mmdata));
1998: C->product->data = mmdata;
1999: C->product->destroy = MatProductCtxDestroy_MatMatHipsparse;
2001: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2002: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
2003: Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr; /* Access spptr after MatSeqAIJHIPSPARSECopyToGPU, not before */
2004: Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr;
2005: PetscCheck(Acusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2006: PetscCheck(Bcusp->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_HIPSPARSE_CSR format");
2008: ptype = product->type;
2009: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2010: ptype = MATPRODUCT_AB;
2011: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2012: }
2013: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2014: ptype = MATPRODUCT_AB;
2015: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2016: }
2017: biscompressed = PETSC_FALSE;
2018: ciscompressed = PETSC_FALSE;
2019: switch (ptype) {
2020: case MATPRODUCT_AB:
2021: m = A->rmap->n;
2022: n = B->cmap->n;
2023: k = A->cmap->n;
2024: Amat = Acusp->mat;
2025: Bmat = Bcusp->mat;
2026: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2027: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2028: break;
2029: case MATPRODUCT_AtB:
2030: m = A->cmap->n;
2031: n = B->cmap->n;
2032: k = A->rmap->n;
2033: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2034: Amat = Acusp->matTranspose;
2035: Bmat = Bcusp->mat;
2036: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2037: break;
2038: case MATPRODUCT_ABt:
2039: m = A->rmap->n;
2040: n = B->rmap->n;
2041: k = A->cmap->n;
2042: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
2043: Amat = Acusp->mat;
2044: Bmat = Bcusp->matTranspose;
2045: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2046: break;
2047: default:
2048: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2049: }
2051: /* create hipsparse matrix */
2052: PetscCall(MatSetSizes(C, m, n, m, n));
2053: PetscCall(MatSetType(C, MATSEQAIJHIPSPARSE));
2054: c = (Mat_SeqAIJ *)C->data;
2055: Ccusp = (Mat_SeqAIJHIPSPARSE *)C->spptr;
2056: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
2057: Ccsr = new CsrMatrix;
2059: c->compressedrow.use = ciscompressed;
2060: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2061: c->compressedrow.nrows = a->compressedrow.nrows;
2062: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2063: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2064: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2065: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2066: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2067: } else {
2068: c->compressedrow.nrows = 0;
2069: c->compressedrow.i = NULL;
2070: c->compressedrow.rindex = NULL;
2071: Ccusp->workVector = NULL;
2072: Cmat->cprowIndices = NULL;
2073: }
2074: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2075: Ccusp->mat = Cmat;
2076: Ccusp->mat->mat = Ccsr;
2077: Ccsr->num_rows = Ccusp->nrows;
2078: Ccsr->num_cols = n;
2079: Ccsr->row_offsets = new THRUSTINTARRAY(Ccusp->nrows + 1);
2080: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
2081: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
2082: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
2083: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2084: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2085: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2086: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2087: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
2088: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
2089: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* hipsparse raise errors in different calls when matrices have zero rows/columns! */
2090: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2091: c->nz = 0;
2092: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2093: Ccsr->values = new THRUSTARRAY(c->nz);
2094: goto finalizesym;
2095: }
2097: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2098: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2099: Acsr = (CsrMatrix *)Amat->mat;
2100: if (!biscompressed) {
2101: Bcsr = (CsrMatrix *)Bmat->mat;
2102: BmatSpDescr = Bmat->matDescr;
2103: } else { /* we need to use row offsets for the full matrix */
2104: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2105: Bcsr = new CsrMatrix;
2106: Bcsr->num_rows = B->rmap->n;
2107: Bcsr->num_cols = cBcsr->num_cols;
2108: Bcsr->num_entries = cBcsr->num_entries;
2109: Bcsr->column_indices = cBcsr->column_indices;
2110: Bcsr->values = cBcsr->values;
2111: if (!Bcusp->rowoffsets_gpu) {
2112: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
2113: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2114: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2115: }
2116: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2117: mmdata->Bcsr = Bcsr;
2118: if (Bcsr->num_rows && Bcsr->num_cols) {
2119: PetscCallHIPSPARSE(hipsparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2120: }
2121: BmatSpDescr = mmdata->matSpBDescr;
2122: }
2123: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2124: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2125: /* precompute flops count */
2126: if (ptype == MATPRODUCT_AB) {
2127: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2128: const PetscInt st = a->i[i];
2129: const PetscInt en = a->i[i + 1];
2130: for (j = st; j < en; j++) {
2131: const PetscInt brow = a->j[j];
2132: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2133: }
2134: }
2135: } else if (ptype == MATPRODUCT_AtB) {
2136: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2137: const PetscInt anzi = a->i[i + 1] - a->i[i];
2138: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2139: flops += (2. * anzi) * bnzi;
2140: }
2141: } else flops = 0.; /* TODO */
2143: mmdata->flops = flops;
2144: PetscCall(PetscLogGpuTimeBegin());
2146: PetscCallHIPSPARSE(hipsparseSetPointerMode(Ccusp->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2147: // cuda-12.2 requires non-null csrRowOffsets
2148: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
2149: PetscCallHIPSPARSE(hipsparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2150: // Note that cusparseSpGEMMreuse is deprecated in CUDA 13.2.1
2152: /* ask bufferSize bytes for external memory */
2153: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2154: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2155: /* inspect the matrices A and B to understand the memory requirement for the next step */
2156: PetscCallHIPSPARSE(hipsparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2157: /* ask bufferSize again bytes for external memory */
2158: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2159: /* The HIPSPARSE documentation is not clear, nor the API
2160: We need both buffers to perform the operations properly!
2161: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2162: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2163: is stored in the descriptor! What a messy API... */
2164: PetscCallHIP(hipMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2165: /* compute the intermediate product of A * B */
2166: PetscCallHIPSPARSE(hipsparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2167: /* get matrix C non-zero entries C_nnz1 */
2168: PetscCallHIPSPARSE(hipsparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2169: PetscCall(PetscIntCast(C_nnz1, &c->nz));
2170: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2171: mmdata->mmBufferSize / 1024));
2172: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2173: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2174: Ccsr->values = new THRUSTARRAY(c->nz);
2175: PetscCallHIP(hipPeekAtLastError()); /* catch out of memory errors */
2176: // hipSparse errors with null pointers even with nz = 0
2177: if (c->nz) PetscCallHIPSPARSE(hipsparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2178: PetscCallHIPSPARSE(hipsparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, hipsparse_scalartype, HIPSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2179: PetscCall(PetscLogGpuFlops(mmdata->flops));
2180: PetscCall(PetscLogGpuTimeEnd());
2181: finalizesym:
2182: c->free_a = PETSC_TRUE;
2183: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2184: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2185: c->free_ij = PETSC_TRUE;
2187: PetscInt *d_i = c->i;
2188: if (ciscompressed) d_i = c->compressedrow.i;
2189: PetscCallHIP(hipMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2190: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
2191: if (ciscompressed) { /* need to expand host row offsets */
2192: PetscInt r = 0;
2193: c->i[0] = 0;
2194: for (k = 0; k < c->compressedrow.nrows; k++) {
2195: const PetscInt next = c->compressedrow.rindex[k];
2196: const PetscInt old = c->compressedrow.i[k];
2197: for (; r < next; r++) c->i[r + 1] = old;
2198: }
2199: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2200: }
2201: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2202: PetscCall(PetscMalloc1(m, &c->ilen));
2203: PetscCall(PetscMalloc1(m, &c->imax));
2204: c->maxnz = c->nz;
2205: c->nonzerorowcnt = 0;
2206: c->rmax = 0;
2207: for (k = 0; k < m; k++) {
2208: const PetscInt nn = c->i[k + 1] - c->i[k];
2209: c->ilen[k] = c->imax[k] = nn;
2210: c->nonzerorowcnt += (PetscInt)!!nn;
2211: c->rmax = PetscMax(c->rmax, nn);
2212: }
2213: PetscCall(PetscMalloc1(c->nz, &c->a));
2214: Ccsr->num_entries = c->nz;
2216: C->nonzerostate++;
2217: PetscCall(PetscLayoutSetUp(C->rmap));
2218: PetscCall(PetscLayoutSetUp(C->cmap));
2219: Ccusp->nonzerostate = C->nonzerostate;
2220: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2221: C->preallocated = PETSC_TRUE;
2222: C->assembled = PETSC_FALSE;
2223: C->was_assembled = PETSC_FALSE;
2224: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2225: mmdata->reusesym = PETSC_TRUE;
2226: C->offloadmask = PETSC_OFFLOAD_GPU;
2227: }
2228: C->ops->productnumeric = MatProductNumeric_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2229: PetscFunctionReturn(PETSC_SUCCESS);
2230: }
2232: /* handles sparse or dense B */
2233: static PetscErrorCode MatProductSetFromOptions_SeqAIJHIPSPARSE(Mat mat)
2234: {
2235: Mat_Product *product = mat->product;
2236: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2238: PetscFunctionBegin;
2239: MatCheckProduct(mat, 1);
2240: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2241: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJHIPSPARSE, &Biscusp));
2242: if (product->type == MATPRODUCT_ABC) {
2243: Ciscusp = PETSC_FALSE;
2244: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJHIPSPARSE, &Ciscusp));
2245: }
2246: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2247: PetscBool usecpu = PETSC_FALSE;
2248: switch (product->type) {
2249: case MATPRODUCT_AB:
2250: if (product->api_user) {
2251: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2252: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2253: PetscOptionsEnd();
2254: } else {
2255: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2256: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2257: PetscOptionsEnd();
2258: }
2259: break;
2260: case MATPRODUCT_AtB:
2261: if (product->api_user) {
2262: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2263: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2264: PetscOptionsEnd();
2265: } else {
2266: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2267: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2268: PetscOptionsEnd();
2269: }
2270: break;
2271: case MATPRODUCT_PtAP:
2272: if (product->api_user) {
2273: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2274: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2275: PetscOptionsEnd();
2276: } else {
2277: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2278: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2279: PetscOptionsEnd();
2280: }
2281: break;
2282: case MATPRODUCT_RARt:
2283: if (product->api_user) {
2284: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2285: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2286: PetscOptionsEnd();
2287: } else {
2288: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2289: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2290: PetscOptionsEnd();
2291: }
2292: break;
2293: case MATPRODUCT_ABC:
2294: if (product->api_user) {
2295: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2296: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2297: PetscOptionsEnd();
2298: } else {
2299: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2300: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2301: PetscOptionsEnd();
2302: }
2303: break;
2304: default:
2305: break;
2306: }
2307: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2308: }
2309: /* dispatch */
2310: if (isdense) {
2311: switch (product->type) {
2312: case MATPRODUCT_AB:
2313: case MATPRODUCT_AtB:
2314: case MATPRODUCT_ABt:
2315: case MATPRODUCT_PtAP:
2316: case MATPRODUCT_RARt:
2317: if (product->A->boundtocpu) PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2318: else mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqDENSEHIP;
2319: break;
2320: case MATPRODUCT_ABC:
2321: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2322: break;
2323: default:
2324: break;
2325: }
2326: } else if (Biscusp && Ciscusp) {
2327: switch (product->type) {
2328: case MATPRODUCT_AB:
2329: case MATPRODUCT_AtB:
2330: case MATPRODUCT_ABt:
2331: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJHIPSPARSE_SeqAIJHIPSPARSE;
2332: break;
2333: case MATPRODUCT_PtAP:
2334: case MATPRODUCT_RARt:
2335: case MATPRODUCT_ABC:
2336: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2337: break;
2338: default:
2339: break;
2340: }
2341: } else PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); /* fallback for AIJ */
2342: PetscFunctionReturn(PETSC_SUCCESS);
2343: }
2345: static PetscErrorCode MatMult_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2346: {
2347: PetscFunctionBegin;
2348: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2349: PetscFunctionReturn(PETSC_SUCCESS);
2350: }
2352: static PetscErrorCode MatMultAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2353: {
2354: PetscFunctionBegin;
2355: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2356: PetscFunctionReturn(PETSC_SUCCESS);
2357: }
2359: static PetscErrorCode MatMultHermitianTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2360: {
2361: PetscFunctionBegin;
2362: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2363: PetscFunctionReturn(PETSC_SUCCESS);
2364: }
2366: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2367: {
2368: PetscFunctionBegin;
2369: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2370: PetscFunctionReturn(PETSC_SUCCESS);
2371: }
2373: static PetscErrorCode MatMultTranspose_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
2374: {
2375: PetscFunctionBegin;
2376: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2377: PetscFunctionReturn(PETSC_SUCCESS);
2378: }
2380: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2381: {
2382: int i = blockIdx.x * blockDim.x + threadIdx.x;
2383: if (i < n) y[idx[i]] += x[i];
2384: }
2386: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2387: static PetscErrorCode MatMultAddKernel_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2388: {
2389: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2390: Mat_SeqAIJHIPSPARSE *hipsparsestruct = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2391: Mat_SeqAIJHIPSPARSEMultStruct *matstruct;
2392: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
2393: hipsparseOperation_t opA = HIPSPARSE_OPERATION_NON_TRANSPOSE;
2394: PetscBool compressed;
2395: PetscInt nx, ny;
2397: PetscFunctionBegin;
2398: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2399: if (!a->nz) {
2400: if (yy) PetscCall(VecSeq_HIP::Copy(yy, zz));
2401: else PetscCall(VecSeq_HIP::Set(zz, 0));
2402: PetscFunctionReturn(PETSC_SUCCESS);
2403: }
2404: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2405: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
2406: if (!trans) {
2407: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
2408: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJHIPSPARSE does not have a 'mat' (need to fix)");
2409: } else {
2410: if (herm || !A->form_explicit_transpose) {
2411: opA = herm ? HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE : HIPSPARSE_OPERATION_TRANSPOSE;
2412: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->mat;
2413: } else {
2414: if (!hipsparsestruct->matTranspose) PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
2415: matstruct = (Mat_SeqAIJHIPSPARSEMultStruct *)hipsparsestruct->matTranspose;
2416: }
2417: }
2418: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2419: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2420: try {
2421: PetscCall(VecHIPGetArrayRead(xx, (const PetscScalar **)&xarray));
2422: if (yy == zz) PetscCall(VecHIPGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2423: else PetscCall(VecHIPGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
2425: PetscCall(PetscLogGpuTimeBegin());
2426: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
2427: /* z = A x + beta y.
2428: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2429: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2430: */
2431: xptr = xarray;
2432: dptr = compressed ? hipsparsestruct->workVector->data().get() : zarray;
2433: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2434: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2435: allocated to accommodate different uses. So we get the length info directly from mat.
2436: */
2437: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2438: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2439: nx = mat->num_cols;
2440: ny = mat->num_rows;
2441: }
2442: } else {
2443: /* z = A^T x + beta y
2444: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2445: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2446: */
2447: xptr = compressed ? hipsparsestruct->workVector->data().get() : xarray;
2448: dptr = zarray;
2449: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2450: if (compressed) { /* Scatter x to work vector */
2451: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2452: thrust::for_each(
2453: #if PetscDefined(HAVE_THRUST_ASYNC)
2454: thrust::hip::par.on(PetscDefaultHipStream),
2455: #endif
2456: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2457: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecHIPEqualsReverse());
2458: }
2459: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2460: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2461: nx = mat->num_rows;
2462: ny = mat->num_cols;
2463: }
2464: }
2465: /* csr_spmv does y = alpha op(A) x + beta y */
2466: if (hipsparsestruct->format == MAT_HIPSPARSE_CSR) {
2467: #if PETSC_PKG_HIP_VERSION_GE(5, 1, 0) && !(PETSC_PKG_HIP_VERSION_GT(6, 4, 3) && PETSC_PKG_HIP_VERSION_LE(7, 2, 0))
2468: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "hipSPARSE API on hipsparseOperation_t has changed and PETSc has not been updated accordingly");
2469: if (!matstruct->hipSpMV[opA].initialized) { /* built on demand */
2470: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecXDescr, nx, xptr, hipsparse_scalartype));
2471: PetscCallHIPSPARSE(hipsparseCreateDnVec(&matstruct->hipSpMV[opA].vecYDescr, ny, dptr, hipsparse_scalartype));
2472: PetscCallHIPSPARSE(hipsparseSpMV_bufferSize(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg,
2473: &matstruct->hipSpMV[opA].spmvBufferSize));
2474: PetscCallHIP(hipMalloc(&matstruct->hipSpMV[opA].spmvBuffer, matstruct->hipSpMV[opA].spmvBufferSize));
2475: matstruct->hipSpMV[opA].initialized = PETSC_TRUE;
2476: } else {
2477: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2478: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecXDescr, xptr));
2479: PetscCallHIPSPARSE(hipsparseDnVecSetValues(matstruct->hipSpMV[opA].vecYDescr, dptr));
2480: }
2481: PetscCallHIPSPARSE(hipsparseSpMV(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJHIPSPARSECopyToGPU() or MatSeqAIJHIPSPARSEFormExplicitTranspose() */
2482: matstruct->hipSpMV[opA].vecXDescr, beta, matstruct->hipSpMV[opA].vecYDescr, hipsparse_scalartype, hipsparsestruct->spmvAlg, matstruct->hipSpMV[opA].spmvBuffer));
2483: #else
2484: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2485: nx = mat->num_rows; /* nx,ny are set before the #if block, set them again to avoid set-but-not-used warning */
2486: ny = mat->num_cols;
2487: PetscCallHIPSPARSE(hipsparse_csr_spmv(hipsparsestruct->handle, opA, nx, ny, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
2488: #endif
2489: } else {
2490: if (hipsparsestruct->nrows) {
2491: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)matstruct->mat;
2492: PetscCallHIPSPARSE(hipsparse_hyb_spmv(hipsparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
2493: }
2494: }
2495: PetscCall(PetscLogGpuTimeEnd());
2497: if (opA == HIPSPARSE_OPERATION_NON_TRANSPOSE) {
2498: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2499: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2500: PetscCall(VecSeq_HIP::Copy(yy, zz)); /* zz = yy */
2501: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2502: PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
2503: }
2504: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2505: PetscCall(VecSeq_HIP::Set(zz, 0));
2506: }
2508: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2509: if (compressed) {
2510: PetscCall(PetscLogGpuTimeBegin());
2511: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
2512: and in the destructor of the scope, it will call hipStreamSynchronize() on this stream. One has to store all events to
2513: prevent that. So I just add a ScatterAdd kernel.
2514: */
2515: #if 0
2516: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2517: thrust::async::for_each(thrust::hip::par.on(hipsparsestruct->stream),
2518: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2519: thrust::make_zip_iterator(thrust::make_tuple(hipsparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2520: VecHIPPlusEquals());
2521: #else
2522: PetscInt n = matstruct->cprowIndices->size();
2523: hipLaunchKernelGGL(ScatterAdd, dim3((n + 255) / 256), dim3(256), 0, PetscDefaultHipStream, n, matstruct->cprowIndices->data().get(), hipsparsestruct->workVector->data().get(), zarray);
2524: #endif
2525: PetscCall(PetscLogGpuTimeEnd());
2526: }
2527: } else {
2528: if (yy && yy != zz) PetscCall(VecSeq_HIP::AXPY(zz, 1.0, yy)); /* zz += yy */
2529: }
2530: PetscCall(VecHIPRestoreArrayRead(xx, (const PetscScalar **)&xarray));
2531: if (yy == zz) PetscCall(VecHIPRestoreArray(zz, &zarray));
2532: else PetscCall(VecHIPRestoreArrayWrite(zz, &zarray));
2533: } catch (char *ex) {
2534: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "HIPSPARSE error: %s", ex);
2535: }
2536: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2537: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2538: PetscFunctionReturn(PETSC_SUCCESS);
2539: }
2541: static PetscErrorCode MatMultTransposeAdd_SeqAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2542: {
2543: PetscFunctionBegin;
2544: PetscCall(MatMultAddKernel_SeqAIJHIPSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2545: PetscFunctionReturn(PETSC_SUCCESS);
2546: }
2548: static PetscErrorCode MatAssemblyEnd_SeqAIJHIPSPARSE(Mat A, MatAssemblyType mode)
2549: {
2550: PetscFunctionBegin;
2551: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AssemblyEnd(A, mode));
2552: PetscFunctionReturn(PETSC_SUCCESS);
2553: }
2555: /*@
2556: MatCreateSeqAIJHIPSPARSE - Creates a sparse matrix in `MATAIJHIPSPARSE` (compressed row) format.
2557: This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for calculations.
2559: Collective
2561: Input Parameters:
2562: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2563: . m - number of rows
2564: . n - number of columns
2565: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is set
2566: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
2568: Output Parameter:
2569: . A - the matrix
2571: Level: intermediate
2573: Notes:
2574: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2575: `MatXXXXSetPreallocation()` paradgm instead of this routine directly.
2576: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation`]
2578: The AIJ format (compressed row storage), is fully compatible with standard Fortran
2579: storage. That is, the stored row and column indices can begin at
2580: either one (as in Fortran) or zero.
2582: Specify the preallocated storage with either `nz` or `nnz` (not both).
2583: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2584: allocation.
2586: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJHIPSPARSE`, `MATAIJHIPSPARSE`
2587: @*/
2588: PetscErrorCode MatCreateSeqAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2589: {
2590: return MatSeqAIJHIPSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2591: }
2593: static PetscErrorCode MatDestroy_SeqAIJHIPSPARSE(Mat A)
2594: {
2595: return MatSeqAIJHIPSPARSE_CUPM_t::Destroy(A);
2596: }
2598: static PetscErrorCode MatDuplicate_SeqAIJHIPSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2599: {
2600: PetscFunctionBegin;
2601: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2602: PetscFunctionReturn(PETSC_SUCCESS);
2603: }
2605: static PetscErrorCode MatAXPY_SeqAIJHIPSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2606: {
2607: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2608: Mat_SeqAIJHIPSPARSE *cy;
2609: Mat_SeqAIJHIPSPARSE *cx;
2610: CsrMatrix *csry, *csrx;
2612: PetscFunctionBegin;
2613: cy = (Mat_SeqAIJHIPSPARSE *)Y->spptr;
2614: cx = (Mat_SeqAIJHIPSPARSE *)X->spptr;
2615: if (X->ops->axpy != Y->ops->axpy) {
2616: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2617: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2618: PetscFunctionReturn(PETSC_SUCCESS);
2619: }
2620: /* if we are here, it means both matrices are bound to GPU */
2621: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(Y));
2622: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(X));
2623: PetscCheck(cy->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
2624: PetscCheck(cx->format == MAT_HIPSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_HIPSPARSE_CSR supported");
2625: csry = (CsrMatrix *)cy->mat->mat;
2626: csrx = (CsrMatrix *)cx->mat->mat;
2627: /* see if we can turn this into a cublas axpy */
2628: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2629: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2630: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2631: if (eq) str = SAME_NONZERO_PATTERN;
2632: }
2633: /* spgeam is buggy with one column */
2634: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
2636: #if !defined(PETSC_USE_64BIT_INDICES) // hipsparseScsrgeam2 etc. do not support 64bit indices
2637: if (str == SUBSET_NONZERO_PATTERN) {
2638: PetscScalar *ay, b = 1.0;
2639: const PetscScalar *ax;
2640: size_t bufferSize;
2641: void *buffer;
2643: PetscCall(MatSeqAIJHIPSPARSEGetArrayRead(X, &ax));
2644: PetscCall(MatSeqAIJHIPSPARSEGetArray(Y, &ay));
2645: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_HOST));
2646: PetscCallHIPSPARSE(hipsparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2647: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2648: PetscCallHIP(hipMalloc(&buffer, bufferSize));
2649: PetscCall(PetscLogGpuTimeBegin());
2650: PetscCallHIPSPARSE(hipsparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2651: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2652: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2653: PetscCall(PetscLogGpuTimeEnd());
2654: PetscCallHIP(hipFree(buffer));
2656: PetscCallHIPSPARSE(hipsparseSetPointerMode(cy->handle, HIPSPARSE_POINTER_MODE_DEVICE));
2657: PetscCall(MatSeqAIJHIPSPARSERestoreArrayRead(X, &ax));
2658: PetscCall(MatSeqAIJHIPSPARSERestoreArray(Y, &ay));
2659: } else
2660: #endif
2661: if (str == SAME_NONZERO_PATTERN) {
2662: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2663: } else {
2664: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2665: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2666: }
2667: PetscFunctionReturn(PETSC_SUCCESS);
2668: }
2670: static PetscErrorCode MatScale_SeqAIJHIPSPARSE(Mat Y, PetscScalar a)
2671: {
2672: PetscFunctionBegin;
2673: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::Scale(Y, a));
2674: PetscFunctionReturn(PETSC_SUCCESS);
2675: }
2677: static PetscErrorCode MatGetDiagonal_SeqAIJHIPSPARSE(Mat A, Vec diag)
2678: {
2679: PetscFunctionBegin;
2680: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetDiagonal(A, diag));
2681: PetscFunctionReturn(PETSC_SUCCESS);
2682: }
2684: static PetscErrorCode MatDiagonalScale_SeqAIJHIPSPARSE(Mat A, Vec ll, Vec rr)
2685: {
2686: PetscFunctionBegin;
2687: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2688: PetscFunctionReturn(PETSC_SUCCESS);
2689: }
2691: static PetscErrorCode MatZeroEntries_SeqAIJHIPSPARSE(Mat A)
2692: {
2693: PetscFunctionBegin;
2694: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::ZeroEntries(A));
2695: PetscFunctionReturn(PETSC_SUCCESS);
2696: }
2698: static PetscErrorCode MatGetCurrentMemType_SeqAIJHIPSPARSE(Mat A, PetscMemType *m)
2699: {
2700: PetscFunctionBegin;
2701: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetCurrentMemType(A, m));
2702: PetscFunctionReturn(PETSC_SUCCESS);
2703: }
2705: static PetscErrorCode MatBindToCPU_SeqAIJHIPSPARSE(Mat A, PetscBool flg)
2706: {
2707: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2709: PetscFunctionBegin;
2710: if (A->factortype != MAT_FACTOR_NONE) {
2711: A->boundtocpu = flg;
2712: PetscFunctionReturn(PETSC_SUCCESS);
2713: }
2714: if (flg) {
2715: PetscCall(MatSeqAIJHIPSPARSECopyFromGPU(A));
2717: A->ops->scale = MatScale_SeqAIJ;
2718: A->ops->getdiagonal = MatGetDiagonal_SeqAIJ;
2719: A->ops->diagonalscale = MatDiagonalScale_SeqAIJ;
2720: A->ops->axpy = MatAXPY_SeqAIJ;
2721: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
2722: A->ops->mult = MatMult_SeqAIJ;
2723: A->ops->multadd = MatMultAdd_SeqAIJ;
2724: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
2725: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
2726: A->ops->multhermitiantranspose = NULL;
2727: A->ops->multhermitiantransposeadd = NULL;
2728: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
2729: A->ops->getcurrentmemtype = NULL;
2730: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2731: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2732: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", NULL));
2733: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", NULL));
2734: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2735: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2736: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", NULL));
2737: } else {
2738: A->ops->scale = MatScale_SeqAIJHIPSPARSE;
2739: A->ops->getdiagonal = MatGetDiagonal_SeqAIJHIPSPARSE;
2740: A->ops->diagonalscale = MatDiagonalScale_SeqAIJHIPSPARSE;
2741: A->ops->axpy = MatAXPY_SeqAIJHIPSPARSE;
2742: A->ops->zeroentries = MatZeroEntries_SeqAIJHIPSPARSE;
2743: A->ops->mult = MatMult_SeqAIJHIPSPARSE;
2744: A->ops->multadd = MatMultAdd_SeqAIJHIPSPARSE;
2745: A->ops->multtranspose = MatMultTranspose_SeqAIJHIPSPARSE;
2746: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJHIPSPARSE;
2747: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJHIPSPARSE;
2748: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJHIPSPARSE;
2749: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJHIPSPARSE;
2750: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;
2751: a->ops->getarray = MatSeqAIJGetArray_SeqAIJHIPSPARSE;
2752: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJHIPSPARSE;
2753: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJHIPSPARSE;
2754: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJHIPSPARSE;
2755: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJHIPSPARSE;
2756: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJHIPSPARSE;
2757: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJHIPSPARSE;
2759: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJHIPSPARSE));
2760: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdensehip_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2761: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqdense_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2762: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJHIPSPARSE));
2763: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJHIPSPARSE));
2764: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijhipsparse_seqaijhipsparse_C", MatProductSetFromOptions_SeqAIJHIPSPARSE));
2765: }
2766: A->boundtocpu = flg;
2767: a->inode.use = (flg && a->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
2768: PetscFunctionReturn(PETSC_SUCCESS);
2769: }
2771: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJHIPSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
2772: {
2773: Mat B;
2775: PetscFunctionBegin;
2776: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); /* first use of HIPSPARSE may be via MatConvert */
2777: if (reuse == MAT_INITIAL_MATRIX) {
2778: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2779: } else if (reuse == MAT_REUSE_MATRIX) {
2780: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2781: }
2782: B = *newmat;
2783: PetscCall(PetscFree(B->defaultvectype));
2784: PetscCall(PetscStrallocpy(VECHIP, &B->defaultvectype));
2785: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2786: if (B->factortype == MAT_FACTOR_NONE) {
2787: Mat_SeqAIJHIPSPARSE *spptr;
2788: PetscCall(PetscNew(&spptr));
2789: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
2790: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
2791: spptr->format = MAT_HIPSPARSE_CSR;
2792: spptr->spmvAlg = HIPSPARSE_SPMV_CSR_ALG1;
2793: spptr->spmmAlg = HIPSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2794: //spptr->csr2cscAlg = HIPSPARSE_CSR2CSC_ALG1;
2796: B->spptr = spptr;
2797: } else {
2798: Mat_SeqAIJHIPSPARSETriFactors *spptr;
2800: PetscCall(PetscNew(&spptr));
2801: PetscCallHIPSPARSE(hipsparseCreate(&spptr->handle));
2802: PetscCallHIPSPARSE(hipsparseSetStream(spptr->handle, PetscDefaultHipStream));
2803: B->spptr = spptr;
2804: }
2805: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2806: }
2807: B->ops->assemblyend = MatAssemblyEnd_SeqAIJHIPSPARSE;
2808: B->ops->destroy = MatDestroy_SeqAIJHIPSPARSE;
2809: B->ops->setoption = MatSetOption_SeqAIJHIPSPARSE;
2810: B->ops->setfromoptions = MatSetFromOptions_SeqAIJHIPSPARSE;
2811: B->ops->bindtocpu = MatBindToCPU_SeqAIJHIPSPARSE;
2812: B->ops->duplicate = MatDuplicate_SeqAIJHIPSPARSE;
2813: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJHIPSPARSE;
2815: PetscCall(MatBindToCPU_SeqAIJHIPSPARSE(B, PETSC_FALSE));
2816: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJHIPSPARSE));
2817: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_SeqAIJHIPSPARSE));
2818: #if defined(PETSC_HAVE_HYPRE)
2819: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
2820: #endif
2821: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatHIPSPARSESetUseCPUSolve_C", MatHIPSPARSESetUseCPUSolve_SeqAIJHIPSPARSE));
2822: PetscFunctionReturn(PETSC_SUCCESS);
2823: }
2825: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJHIPSPARSE(Mat B)
2826: {
2827: PetscFunctionBegin;
2828: PetscCall(MatCreate_SeqAIJ(B));
2829: PetscCall(MatConvert_SeqAIJ_SeqAIJHIPSPARSE(B, MATSEQAIJHIPSPARSE, MAT_INPLACE_MATRIX, &B));
2830: PetscFunctionReturn(PETSC_SUCCESS);
2831: }
2833: /*MC
2834: MATSEQAIJHIPSPARSE - MATAIJHIPSPARSE = "(seq)aijhipsparse" - A matrix type to be used for sparse matrices on AMD GPUs
2836: A matrix type whose data resides on AMD GPUs. These matrices can be in either
2837: CSR, ELL, or Hybrid format.
2838: All matrix calculations are performed on AMD/NVIDIA GPUs using the HIPSPARSE library.
2840: Options Database Keys:
2841: + -mat_type aijhipsparse - sets the matrix type to `MATSEQAIJHIPSPARSE`
2842: . -mat_hipsparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2843: Other options include ell (ellpack) or hyb (hybrid).
2844: . -mat_hipsparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2845: - -mat_hipsparse_use_cpu_solve - Do `MatSolve()` on the CPU
2847: Level: beginner
2849: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJHIPSPARSE()`, `MATAIJHIPSPARSE`, `MatCreateAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
2850: M*/
2852: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_HIPSPARSE(void)
2853: {
2854: PetscFunctionBegin;
2855: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijhipsparse_hipsparse));
2856: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijhipsparse_hipsparse));
2857: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijhipsparse_hipsparse));
2858: PetscCall(MatSolverTypeRegister(MATSOLVERHIPSPARSE, MATSEQAIJHIPSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijhipsparse_hipsparse));
2859: PetscFunctionReturn(PETSC_SUCCESS);
2860: }
2862: static PetscErrorCode MatSeqAIJHIPSPARSE_Destroy(Mat mat)
2863: {
2864: Mat_SeqAIJHIPSPARSE *cusp = static_cast<Mat_SeqAIJHIPSPARSE *>(mat->spptr);
2866: PetscFunctionBegin;
2867: if (cusp) {
2868: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2869: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2870: delete cusp->workVector;
2871: delete cusp->rowoffsets_gpu;
2872: delete cusp->csr2csc_i;
2873: delete cusp->coords;
2874: if (cusp->handle) PetscCallHIPSPARSE(hipsparseDestroy(cusp->handle));
2875: PetscCall(PetscFree(mat->spptr));
2876: }
2877: PetscFunctionReturn(PETSC_SUCCESS);
2878: }
2880: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2881: {
2882: PetscFunctionBegin;
2883: if (*mat) {
2884: delete (*mat)->values;
2885: delete (*mat)->column_indices;
2886: delete (*mat)->row_offsets;
2887: delete *mat;
2888: *mat = 0;
2889: }
2890: PetscFunctionReturn(PETSC_SUCCESS);
2891: }
2893: static PetscErrorCode MatSeqAIJHIPSPARSEMultStruct_Destroy(Mat_SeqAIJHIPSPARSEMultStruct **matstruct, MatHIPSPARSEStorageFormat format)
2894: {
2895: CsrMatrix *mat;
2897: PetscFunctionBegin;
2898: if (*matstruct) {
2899: if ((*matstruct)->mat) {
2900: if (format == MAT_HIPSPARSE_ELL || format == MAT_HIPSPARSE_HYB) {
2901: hipsparseHybMat_t hybMat = (hipsparseHybMat_t)(*matstruct)->mat;
2902: PetscCallHIPSPARSE(hipsparseDestroyHybMat(hybMat));
2903: } else {
2904: mat = (CsrMatrix *)(*matstruct)->mat;
2905: PetscCall(CsrMatrix_Destroy(&mat));
2906: }
2907: }
2908: if ((*matstruct)->descr) PetscCallHIPSPARSE(hipsparseDestroyMatDescr((*matstruct)->descr));
2909: delete (*matstruct)->cprowIndices;
2910: PetscCallHIP(hipFree((*matstruct)->alpha_one));
2911: PetscCallHIP(hipFree((*matstruct)->beta_zero));
2912: PetscCallHIP(hipFree((*matstruct)->beta_one));
2914: Mat_SeqAIJHIPSPARSEMultStruct *mdata = *matstruct;
2915: if (mdata->matDescr) PetscCallHIPSPARSE(hipsparseDestroySpMat(mdata->matDescr));
2916: for (int i = 0; i < 3; i++) {
2917: if (mdata->hipSpMV[i].initialized) {
2918: PetscCallHIP(hipFree(mdata->hipSpMV[i].spmvBuffer));
2919: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecXDescr));
2920: PetscCallHIPSPARSE(hipsparseDestroyDnVec(mdata->hipSpMV[i].vecYDescr));
2921: }
2922: }
2923: delete *matstruct;
2924: *matstruct = NULL;
2925: }
2926: PetscFunctionReturn(PETSC_SUCCESS);
2927: }
2929: PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Reset(Mat_SeqAIJHIPSPARSETriFactors_p *trifactors)
2930: {
2931: Mat_SeqAIJHIPSPARSETriFactors *fs = *trifactors;
2933: PetscFunctionBegin;
2934: if (fs) {
2935: delete fs->rpermIndices;
2936: delete fs->cpermIndices;
2937: fs->rpermIndices = NULL;
2938: fs->cpermIndices = NULL;
2939: fs->init_dev_prop = PETSC_FALSE;
2940: PetscCallHIP(hipFree(fs->csrRowPtr));
2941: PetscCallHIP(hipFree(fs->csrColIdx));
2942: PetscCallHIP(hipFree(fs->csrRowPtr32));
2943: PetscCallHIP(hipFree(fs->csrColIdx32));
2944: PetscCallHIP(hipFree(fs->csrVal));
2945: PetscCallHIP(hipFree(fs->diag));
2946: PetscCallHIP(hipFree(fs->X));
2947: PetscCallHIP(hipFree(fs->Y));
2948: // PetscCallHIP(hipFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
2949: PetscCallHIP(hipFree(fs->spsvBuffer_L));
2950: PetscCallHIP(hipFree(fs->spsvBuffer_U));
2951: PetscCallHIP(hipFree(fs->spsvBuffer_Lt));
2952: PetscCallHIP(hipFree(fs->spsvBuffer_Ut));
2953: PetscCallHIPSPARSE(hipsparseDestroyMatDescr(fs->matDescr_M));
2954: if (fs->spMatDescr_L) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_L));
2955: if (fs->spMatDescr_U) PetscCallHIPSPARSE(hipsparseDestroySpMat(fs->spMatDescr_U));
2956: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_L));
2957: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Lt));
2958: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_U));
2959: PetscCallHIPSPARSE(hipsparseSpSV_destroyDescr(fs->spsvDescr_Ut));
2960: if (fs->dnVecDescr_X) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_X));
2961: if (fs->dnVecDescr_Y) PetscCallHIPSPARSE(hipsparseDestroyDnVec(fs->dnVecDescr_Y));
2962: PetscCallHIPSPARSE(hipsparseDestroyCsrilu02Info(fs->ilu0Info_M));
2963: PetscCallHIPSPARSE(hipsparseDestroyCsric02Info(fs->ic0Info_M));
2964: PetscCall(PetscFree(fs->csrRowPtr_h));
2965: PetscCall(PetscFree(fs->csrVal_h));
2966: PetscCall(PetscFree(fs->diag_h));
2967: fs->createdTransposeSpSVDescr = PETSC_FALSE;
2968: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
2969: }
2970: PetscFunctionReturn(PETSC_SUCCESS);
2971: }
2973: static PetscErrorCode MatSeqAIJHIPSPARSETriFactors_Destroy(Mat_SeqAIJHIPSPARSETriFactors **trifactors)
2974: {
2975: PetscFunctionBegin;
2976: if (*trifactors) {
2977: PetscCall(MatSeqAIJHIPSPARSETriFactors_Reset(trifactors));
2978: PetscCallHIPSPARSE(hipsparseDestroy((*trifactors)->handle));
2979: PetscCall(PetscFree(*trifactors));
2980: }
2981: PetscFunctionReturn(PETSC_SUCCESS);
2982: }
2984: static PetscErrorCode MatSeqAIJHIPSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
2985: {
2986: Mat_SeqAIJHIPSPARSE *cusp = (Mat_SeqAIJHIPSPARSE *)A->spptr;
2988: PetscFunctionBegin;
2989: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
2990: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
2991: if (destroy) {
2992: PetscCall(MatSeqAIJHIPSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2993: delete cusp->csr2csc_i;
2994: cusp->csr2csc_i = NULL;
2995: }
2996: A->transupdated = PETSC_FALSE;
2997: PetscFunctionReturn(PETSC_SUCCESS);
2998: }
3000: static PetscErrorCode MatSetPreallocationCOO_SeqAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3001: {
3002: PetscFunctionBegin;
3003: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3004: PetscFunctionReturn(PETSC_SUCCESS);
3005: }
3007: static PetscErrorCode MatSetValuesCOO_SeqAIJHIPSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3008: {
3009: PetscFunctionBegin;
3010: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3011: PetscFunctionReturn(PETSC_SUCCESS);
3012: }
3014: /*@C
3015: MatSeqAIJHIPSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJHIPSPARSE` matrices.
3017: Not Collective
3019: Input Parameters:
3020: + A - the matrix
3021: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3023: Output Parameters:
3024: + i - the CSR row pointers
3025: - j - the CSR column indices
3027: Level: developer
3029: Note:
3030: When compressed is true, the CSR structure does not contain empty rows
3032: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSERestoreIJ()`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3033: @*/
3034: PetscErrorCode MatSeqAIJHIPSPARSEGetIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3035: {
3036: PetscFunctionBegin;
3037: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3038: PetscFunctionReturn(PETSC_SUCCESS);
3039: }
3041: /*@C
3042: MatSeqAIJHIPSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJHIPSPARSEGetIJ()`
3044: Not Collective
3046: Input Parameters:
3047: + A - the matrix
3048: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3049: . i - the CSR row pointers
3050: - j - the CSR column indices
3052: Level: developer
3054: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetIJ()`
3055: @*/
3056: PetscErrorCode MatSeqAIJHIPSPARSERestoreIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3057: {
3058: PetscFunctionBegin;
3059: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3060: PetscFunctionReturn(PETSC_SUCCESS);
3061: }
3063: /*@C
3064: MatSeqAIJHIPSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3066: Not Collective
3068: Input Parameter:
3069: . A - a `MATSEQAIJHIPSPARSE` matrix
3071: Output Parameter:
3072: . a - pointer to the device data
3074: Level: developer
3076: Note:
3077: May trigger host-device copies if the up-to-date matrix data is on host
3079: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArrayRead()`
3080: @*/
3081: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayRead(Mat A, const PetscScalar *a[])
3082: {
3083: return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayRead(A, a);
3084: }
3086: /*@C
3087: MatSeqAIJHIPSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayRead()`
3089: Not Collective
3091: Input Parameters:
3092: + A - a `MATSEQAIJHIPSPARSE` matrix
3093: - a - pointer to the device data
3095: Level: developer
3097: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`
3098: @*/
3099: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayRead(Mat A, const PetscScalar *a[])
3100: {
3101: return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayRead(A, a);
3102: }
3104: /*@C
3105: MatSeqAIJHIPSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3107: Not Collective
3109: Input Parameter:
3110: . A - a `MATSEQAIJHIPSPARSE` matrix
3112: Output Parameter:
3113: . a - pointer to the device data
3115: Level: developer
3117: Note:
3118: May trigger host-device copies if up-to-date matrix data is on host
3120: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSEGetArrayWrite()`, `MatSeqAIJHIPSPARSERestoreArray()`
3121: @*/
3122: PetscErrorCode MatSeqAIJHIPSPARSEGetArray(Mat A, PetscScalar *a[])
3123: {
3124: return MatSeqAIJHIPSPARSE_CUPM_t::GetArray(A, a);
3125: }
3126: /*@C
3127: MatSeqAIJHIPSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJHIPSPARSEGetArray()`
3129: Not Collective
3131: Input Parameters:
3132: + A - a `MATSEQAIJHIPSPARSE` matrix
3133: - a - pointer to the device data
3135: Level: developer
3137: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`
3138: @*/
3139: PetscErrorCode MatSeqAIJHIPSPARSERestoreArray(Mat A, PetscScalar *a[])
3140: {
3141: return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArray(A, a);
3142: }
3144: /*@C
3145: MatSeqAIJHIPSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJHIPSPARSE` matrix is stored
3147: Not Collective
3149: Input Parameter:
3150: . A - a `MATSEQAIJHIPSPARSE` matrix
3152: Output Parameter:
3153: . a - pointer to the device data
3155: Level: developer
3157: Note:
3158: Does not trigger host-device copies and flags data validity on the GPU
3160: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArray()`, `MatSeqAIJHIPSPARSEGetArrayRead()`, `MatSeqAIJHIPSPARSERestoreArrayWrite()`
3161: @*/
3162: PetscErrorCode MatSeqAIJHIPSPARSEGetArrayWrite(Mat A, PetscScalar *a[])
3163: {
3164: return MatSeqAIJHIPSPARSE_CUPM_t::GetArrayWrite(A, a);
3165: }
3167: /*@C
3168: MatSeqAIJHIPSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJHIPSPARSEGetArrayWrite()`
3170: Not Collective
3172: Input Parameters:
3173: + A - a `MATSEQAIJHIPSPARSE` matrix
3174: - a - pointer to the device data
3176: Level: developer
3178: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJHIPSPARSEGetArrayWrite()`
3179: @*/
3180: PetscErrorCode MatSeqAIJHIPSPARSERestoreArrayWrite(Mat A, PetscScalar *a[])
3181: {
3182: return MatSeqAIJHIPSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3183: }
3185: struct IJCompare4 {
3186: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3187: {
3188: if (t1.get<0>() < t2.get<0>()) return true;
3189: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3190: return false;
3191: }
3192: };
3194: struct Shift {
3195: int _shift;
3197: Shift(int shift) : _shift(shift) { }
3198: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
3199: };
3201: /* merges two SeqAIJHIPSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3202: PetscErrorCode MatSeqAIJHIPSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3203: {
3204: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3205: Mat_SeqAIJHIPSPARSE *Acusp = (Mat_SeqAIJHIPSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJHIPSPARSE *)B->spptr, *Ccusp;
3206: Mat_SeqAIJHIPSPARSEMultStruct *Cmat;
3207: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3208: PetscInt Annz, Bnnz;
3209: PetscInt i, m, n, zero = 0;
3211: PetscFunctionBegin;
3214: PetscAssertPointer(C, 4);
3215: PetscCheckTypeName(A, MATSEQAIJHIPSPARSE);
3216: PetscCheckTypeName(B, MATSEQAIJHIPSPARSE);
3217: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3218: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3219: PetscCheck(Acusp->format != MAT_HIPSPARSE_ELL && Acusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3220: PetscCheck(Bcusp->format != MAT_HIPSPARSE_ELL && Bcusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3221: if (reuse == MAT_INITIAL_MATRIX) {
3222: m = A->rmap->n;
3223: n = A->cmap->n + B->cmap->n;
3224: PetscCall(MatCreate(PETSC_COMM_SELF, C));
3225: PetscCall(MatSetSizes(*C, m, n, m, n));
3226: PetscCall(MatSetType(*C, MATSEQAIJHIPSPARSE));
3227: c = (Mat_SeqAIJ *)(*C)->data;
3228: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
3229: Cmat = new Mat_SeqAIJHIPSPARSEMultStruct;
3230: Ccsr = new CsrMatrix;
3231: Cmat->cprowIndices = NULL;
3232: c->compressedrow.use = PETSC_FALSE;
3233: c->compressedrow.nrows = 0;
3234: c->compressedrow.i = NULL;
3235: c->compressedrow.rindex = NULL;
3236: Ccusp->workVector = NULL;
3237: Ccusp->nrows = m;
3238: Ccusp->mat = Cmat;
3239: Ccusp->mat->mat = Ccsr;
3240: Ccsr->num_rows = m;
3241: Ccsr->num_cols = n;
3242: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&Cmat->descr));
3243: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(Cmat->descr, HIPSPARSE_INDEX_BASE_ZERO));
3244: PetscCallHIPSPARSE(hipsparseSetMatType(Cmat->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
3245: PetscCallHIP(hipMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3246: PetscCallHIP(hipMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3247: PetscCallHIP(hipMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3248: PetscCallHIP(hipMemcpy(Cmat->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3249: PetscCallHIP(hipMemcpy(Cmat->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
3250: PetscCallHIP(hipMemcpy(Cmat->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3251: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3252: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
3253: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3254: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3256: Acsr = (CsrMatrix *)Acusp->mat->mat;
3257: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3258: Annz = (PetscInt)Acsr->column_indices->size();
3259: Bnnz = (PetscInt)Bcsr->column_indices->size();
3260: c->nz = Annz + Bnnz;
3261: Ccsr->row_offsets = new THRUSTINTARRAY(m + 1);
3262: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
3263: Ccsr->values = new THRUSTARRAY(c->nz);
3264: Ccsr->num_entries = c->nz;
3265: Ccusp->coords = new THRUSTINTARRAY(c->nz);
3266: if (c->nz) {
3267: auto Acoo = new THRUSTINTARRAY(Annz); // initialized with zeros
3268: auto Bcoo = new THRUSTINTARRAY(Bnnz);
3269: auto Ccoo = new THRUSTINTARRAY(c->nz);
3270: THRUSTINTARRAY *Aroff, *Broff;
3272: if (a->compressedrow.use) { /* need full row offset */
3273: if (!Acusp->rowoffsets_gpu) {
3274: Acusp->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
3275: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3276: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3277: }
3278: Aroff = Acusp->rowoffsets_gpu;
3279: } else Aroff = Acsr->row_offsets;
3280: if (b->compressedrow.use) { /* need full row offset */
3281: if (!Bcusp->rowoffsets_gpu) {
3282: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
3283: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3284: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3285: }
3286: Broff = Bcusp->rowoffsets_gpu;
3287: } else Broff = Bcsr->row_offsets;
3288: PetscCall(PetscLogGpuTimeBegin());
3289: // Implement cusparseXcsr2coo() with Thrust, as the former doesn't support 64-bit indices.
3290: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Aroff->data().get(), Acoo->data().get())));
3291: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Broff->data().get(), Bcoo->data().get())));
3293: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3294: auto Aperm = thrust::make_constant_iterator(1);
3295: auto Bperm = thrust::make_constant_iterator(0);
3296: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3297: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3298: auto wPerm = new THRUSTINTARRAY(Annz + Bnnz);
3299: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3300: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3301: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); // Use B column indices shifted by A->cmap->n
3302: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3303: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3304: auto p1 = Ccusp->coords->begin();
3305: auto p2 = Ccusp->coords->begin();
3306: thrust::advance(p2, Annz);
3307: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); // put nonzeros in A and B to C in sorted order (by row and then by column)
3308: auto cci = thrust::make_counting_iterator(zero);
3309: auto cce = thrust::make_counting_iterator(c->nz);
3310: auto pred = [](const int &x) { return x; };
3311: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3312: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3313: // Implement a simplified hipsparseXcoo2csr() with Thrust (assuming the row indices are already sorted), as the former doesn't support 64-bit indices.
3314: PetscCallThrust(thrust::lower_bound(thrust::device, Ccoo->begin(), Ccoo->end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m + 1), Ccsr->row_offsets->begin()));
3315: PetscCall(PetscLogGpuTimeEnd());
3316: delete wPerm;
3317: delete Acoo;
3318: delete Bcoo;
3319: delete Ccoo;
3320: PetscCallHIPSPARSE(hipsparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
3321: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3322: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(A));
3323: PetscCall(MatSeqAIJHIPSPARSEFormExplicitTranspose(B));
3324: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3325: Mat_SeqAIJHIPSPARSEMultStruct *CmatT = new Mat_SeqAIJHIPSPARSEMultStruct;
3326: CsrMatrix *CcsrT = new CsrMatrix;
3327: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3328: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3330: (*C)->form_explicit_transpose = PETSC_TRUE;
3331: (*C)->transupdated = PETSC_TRUE;
3332: Ccusp->rowoffsets_gpu = NULL;
3333: CmatT->cprowIndices = NULL;
3334: CmatT->mat = CcsrT;
3335: CcsrT->num_rows = n;
3336: CcsrT->num_cols = m;
3337: CcsrT->num_entries = c->nz;
3339: CcsrT->row_offsets = new THRUSTINTARRAY(n + 1);
3340: CcsrT->column_indices = new THRUSTINTARRAY(c->nz);
3341: CcsrT->values = new THRUSTARRAY(c->nz);
3343: PetscCall(PetscLogGpuTimeBegin());
3344: auto rT = CcsrT->row_offsets->begin();
3345: if (AT) {
3346: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3347: thrust::advance(rT, -1);
3348: }
3349: if (BT) {
3350: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3351: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3352: thrust::copy(titb, tite, rT);
3353: }
3354: auto cT = CcsrT->column_indices->begin();
3355: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3356: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3357: auto vT = CcsrT->values->begin();
3358: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3359: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3360: PetscCall(PetscLogGpuTimeEnd());
3362: PetscCallHIPSPARSE(hipsparseCreateMatDescr(&CmatT->descr));
3363: PetscCallHIPSPARSE(hipsparseSetMatIndexBase(CmatT->descr, HIPSPARSE_INDEX_BASE_ZERO));
3364: PetscCallHIPSPARSE(hipsparseSetMatType(CmatT->descr, HIPSPARSE_MATRIX_TYPE_GENERAL));
3365: PetscCallHIP(hipMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3366: PetscCallHIP(hipMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3367: PetscCallHIP(hipMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3368: PetscCallHIP(hipMemcpy(CmatT->alpha_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3369: PetscCallHIP(hipMemcpy(CmatT->beta_zero, &PETSC_HIPSPARSE_ZERO, sizeof(PetscScalar), hipMemcpyHostToDevice));
3370: PetscCallHIP(hipMemcpy(CmatT->beta_one, &PETSC_HIPSPARSE_ONE, sizeof(PetscScalar), hipMemcpyHostToDevice));
3371: PetscCallHIPSPARSE(hipsparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), csrRowOffsetsType, csrColIndType, HIPSPARSE_INDEX_BASE_ZERO, hipsparse_scalartype));
3372: Ccusp->matTranspose = CmatT;
3373: }
3374: }
3376: c->free_a = PETSC_TRUE;
3377: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3378: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3379: c->free_ij = PETSC_TRUE;
3380: PetscCallHIP(hipMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
3381: PetscCallHIP(hipMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), hipMemcpyDeviceToHost));
3382: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3383: PetscCall(PetscMalloc1(m, &c->ilen));
3384: PetscCall(PetscMalloc1(m, &c->imax));
3385: c->maxnz = c->nz;
3386: c->nonzerorowcnt = 0;
3387: c->rmax = 0;
3388: for (i = 0; i < m; i++) {
3389: const PetscInt nn = c->i[i + 1] - c->i[i];
3390: c->ilen[i] = c->imax[i] = nn;
3391: c->nonzerorowcnt += (PetscInt)!!nn;
3392: c->rmax = PetscMax(c->rmax, nn);
3393: }
3394: PetscCall(PetscMalloc1(c->nz, &c->a));
3395: (*C)->nonzerostate++;
3396: PetscCall(PetscLayoutSetUp((*C)->rmap));
3397: PetscCall(PetscLayoutSetUp((*C)->cmap));
3398: Ccusp->nonzerostate = (*C)->nonzerostate;
3399: (*C)->preallocated = PETSC_TRUE;
3400: } else {
3401: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3402: c = (Mat_SeqAIJ *)(*C)->data;
3403: if (c->nz) {
3404: Ccusp = (Mat_SeqAIJHIPSPARSE *)(*C)->spptr;
3405: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3406: PetscCheck(Ccusp->format != MAT_HIPSPARSE_ELL && Ccusp->format != MAT_HIPSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3407: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3408: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(A));
3409: PetscCall(MatSeqAIJHIPSPARSECopyToGPU(B));
3410: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3411: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJHIPSPARSEMultStruct");
3412: Acsr = (CsrMatrix *)Acusp->mat->mat;
3413: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3414: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3415: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3416: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3417: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3418: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3419: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3420: auto pmid = Ccusp->coords->begin();
3421: thrust::advance(pmid, Acsr->num_entries);
3422: PetscCall(PetscLogGpuTimeBegin());
3423: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3424: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3425: thrust::for_each(zibait, zieait, VecHIPEquals());
3426: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3427: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3428: thrust::for_each(zibbit, ziebit, VecHIPEquals());
3429: PetscCall(MatSeqAIJHIPSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3430: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3431: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJHIPSPARSEMultStruct");
3432: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3433: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3434: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3435: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3436: auto vT = CcsrT->values->begin();
3437: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3438: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3439: (*C)->transupdated = PETSC_TRUE;
3440: }
3441: PetscCall(PetscLogGpuTimeEnd());
3442: }
3443: }
3444: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3445: (*C)->assembled = PETSC_TRUE;
3446: (*C)->was_assembled = PETSC_FALSE;
3447: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
3448: PetscFunctionReturn(PETSC_SUCCESS);
3449: }
3451: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJHIPSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3452: {
3453: PetscFunctionBegin;
3454: PetscCall(MatSeqAIJHIPSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3455: PetscFunctionReturn(PETSC_SUCCESS);
3456: }