Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
15: #include <thrust/adjacent_difference.h>
16: #if PETSC_CPP_VERSION >= 14
17: #define PETSC_HAVE_THRUST_ASYNC 1
18: // thrust::for_each(thrust::cuda::par.on()) requires C++14
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/remove.h>
22: #include <thrust/sort.h>
23: #include <thrust/tuple.h>
24: #include <thrust/unique.h>
25: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0)
26: #include <cuda/std/functional>
27: #endif
29: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
30: /*
31: The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
32: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
33: */
34: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
35: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
36: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
43: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
44: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
45: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
46: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
47: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
48: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
49: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
50: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
51: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
52: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
54: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
55: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
56: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
57: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
59: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
60: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
62: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
63: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
64: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
65: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
67: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
68: {
69: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
71: PetscFunctionBegin;
72: switch (op) {
73: case MAT_CUSPARSE_MULT:
74: cusparsestruct->format = format;
75: break;
76: case MAT_CUSPARSE_ALL:
77: cusparsestruct->format = format;
78: break;
79: default:
80: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
81: }
82: PetscFunctionReturn(PETSC_SUCCESS);
83: }
85: /*@
86: MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
87: operation. Only the `MatMult()` operation can use different GPU storage formats
89: Not Collective
91: Input Parameters:
92: + A - Matrix of type `MATSEQAIJCUSPARSE`
93: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
94: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
95: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
97: Level: intermediate
99: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
100: @*/
101: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
102: {
103: PetscFunctionBegin;
105: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
106: PetscFunctionReturn(PETSC_SUCCESS);
107: }
109: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
110: {
111: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
113: PetscFunctionBegin;
114: cusparsestruct->use_cpu_solve = use_cpu;
115: PetscFunctionReturn(PETSC_SUCCESS);
116: }
118: /*@
119: MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
121: Input Parameters:
122: + A - Matrix of type `MATSEQAIJCUSPARSE`
123: - use_cpu - set flag for using the built-in CPU `MatSolve()`
125: Level: intermediate
127: Note:
128: The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
129: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
130: This method to specify if the solve is done on the CPU or GPU (GPU is the default).
132: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
133: @*/
134: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
135: {
136: PetscFunctionBegin;
138: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
139: PetscFunctionReturn(PETSC_SUCCESS);
140: }
142: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
143: {
144: PetscFunctionBegin;
145: switch (op) {
146: case MAT_FORM_EXPLICIT_TRANSPOSE:
147: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
148: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
149: A->form_explicit_transpose = flg;
150: break;
151: default:
152: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
153: break;
154: }
155: PetscFunctionReturn(PETSC_SUCCESS);
156: }
158: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
159: {
160: MatCUSPARSEStorageFormat format;
161: PetscBool flg;
162: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
164: PetscFunctionBegin;
165: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
166: if (A->factortype == MAT_FACTOR_NONE) {
167: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
168: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
170: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
171: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
172: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
173: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
174: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
175: /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
176: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
177: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
178: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
179: PetscCall(
180: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
181: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
182: }
183: PetscOptionsHeadEnd();
184: PetscFunctionReturn(PETSC_SUCCESS);
185: }
187: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
188: {
189: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
190: PetscInt m = A->rmap->n;
191: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
192: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
193: const MatScalar *Aa = a->a;
194: PetscInt *Mi, *Mj, Mnz;
195: PetscScalar *Ma;
197: PetscFunctionBegin;
198: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
199: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
200: if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
201: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
202: Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
203: PetscCall(PetscMalloc1(m + 1, &Mi));
204: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
205: PetscCall(PetscMalloc1(Mnz, &Ma));
206: Mi[0] = 0;
207: for (PetscInt i = 0; i < m; i++) {
208: PetscInt llen = Ai[i + 1] - Ai[i];
209: PetscInt ulen = adiag[i] - adiag[i + 1];
210: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
211: Mj[Mi[i] + llen] = i; // diagonal entry
212: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
213: Mi[i + 1] = Mi[i] + llen + ulen;
214: }
215: // Copy M (L,U) from host to device
216: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
217: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
218: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
219: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
220: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
222: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
223: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
224: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
225: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
226: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
227: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
228: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
229: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
231: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
232: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
233: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
235: fillMode = CUSPARSE_FILL_MODE_UPPER;
236: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
237: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
238: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
239: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
241: // Allocate work vectors in SpSv
242: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
243: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
245: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
246: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
248: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
249: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
250: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
251: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
252: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
253: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
254: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
256: // Record for reuse
257: fs->csrRowPtr_h = Mi;
258: fs->csrVal_h = Ma;
259: PetscCall(PetscFree(Mj));
260: }
261: // Copy the value
262: Mi = fs->csrRowPtr_h;
263: Ma = fs->csrVal_h;
264: Mnz = Mi[m];
265: for (PetscInt i = 0; i < m; i++) {
266: PetscInt llen = Ai[i + 1] - Ai[i];
267: PetscInt ulen = adiag[i] - adiag[i + 1];
268: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
269: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]]; // recover the diagonal entry
270: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
271: }
272: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
274: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
275: if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
276: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
277: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
278: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
279: } else
280: #endif
281: {
282: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
283: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
285: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
286: fs->updatedSpSVAnalysis = PETSC_TRUE;
287: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
288: }
289: }
290: PetscFunctionReturn(PETSC_SUCCESS);
291: }
293: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
294: {
295: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
296: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
297: IS isrow = a->row, isicol = a->icol;
298: PetscBool row_identity, col_identity;
299: PetscInt n = A->rmap->n;
301: PetscFunctionBegin;
302: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
303: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
305: cusparseTriFactors->nnz = a->nz;
307: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
308: /* lower triangular indices */
309: PetscCall(ISIdentity(isrow, &row_identity));
310: if (!row_identity && !cusparseTriFactors->rpermIndices) {
311: const PetscInt *r;
313: PetscCall(ISGetIndices(isrow, &r));
314: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
315: cusparseTriFactors->rpermIndices->assign(r, r + n);
316: PetscCall(ISRestoreIndices(isrow, &r));
317: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
318: }
320: /* upper triangular indices */
321: PetscCall(ISIdentity(isicol, &col_identity));
322: if (!col_identity && !cusparseTriFactors->cpermIndices) {
323: const PetscInt *c;
325: PetscCall(ISGetIndices(isicol, &c));
326: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
327: cusparseTriFactors->cpermIndices->assign(c, c + n);
328: PetscCall(ISRestoreIndices(isicol, &c));
329: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
330: }
331: PetscFunctionReturn(PETSC_SUCCESS);
332: }
334: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
335: {
336: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
337: PetscInt m = A->rmap->n;
338: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
339: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
340: const MatScalar *Aa = a->a;
341: PetscInt *Mj, Mnz;
342: PetscScalar *Ma, *D;
344: PetscFunctionBegin;
345: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
346: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
347: if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
348: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
349: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
350: Mnz = Ai[m]; // Unz (with the unit diagonal)
351: PetscCall(PetscMalloc1(Mnz, &Ma));
352: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
353: PetscCall(PetscMalloc1(m, &D)); // the diagonal
354: for (PetscInt i = 0; i < m; i++) {
355: PetscInt ulen = Ai[i + 1] - Ai[i];
356: Mj[Ai[i]] = i; // diagonal entry
357: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
358: }
359: // Copy M (U) from host to device
360: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
361: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
362: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
363: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
364: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
365: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
367: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
368: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
369: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
370: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
371: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
372: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
373: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
374: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
376: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
377: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
378: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
380: // Allocate work vectors in SpSv
381: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
382: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
384: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
385: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
387: // Query buffer sizes for SpSV and then allocate buffers
388: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
389: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
390: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
392: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
393: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
394: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
396: // Record for reuse
397: fs->csrVal_h = Ma;
398: fs->diag_h = D;
399: PetscCall(PetscFree(Mj));
400: }
401: // Copy the value
402: Ma = fs->csrVal_h;
403: D = fs->diag_h;
404: Mnz = Ai[m];
405: for (PetscInt i = 0; i < m; i++) {
406: D[i] = Aa[adiag[i]]; // actually Aa[adiag[i]] is the inverse of the diagonal
407: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
408: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
409: }
410: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
411: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
413: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
414: if (fs->updatedSpSVAnalysis) {
415: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
416: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
417: } else
418: #endif
419: {
420: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
421: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
422: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
423: fs->updatedSpSVAnalysis = PETSC_TRUE;
424: }
425: }
426: PetscFunctionReturn(PETSC_SUCCESS);
427: }
429: // Solve Ut D U x = b
430: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
431: {
432: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
433: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
434: const PetscScalar *barray;
435: PetscScalar *xarray;
436: thrust::device_ptr<const PetscScalar> bGPU;
437: thrust::device_ptr<PetscScalar> xGPU;
438: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
439: PetscInt m = A->rmap->n;
441: PetscFunctionBegin;
442: PetscCall(PetscLogGpuTimeBegin());
443: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
444: PetscCall(VecCUDAGetArrayRead(b, &barray));
445: xGPU = thrust::device_pointer_cast(xarray);
446: bGPU = thrust::device_pointer_cast(barray);
448: // Reorder b with the row permutation if needed, and wrap the result in fs->X
449: if (fs->rpermIndices) {
450: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
451: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
452: } else {
453: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
454: }
456: // Solve Ut Y = X
457: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
458: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
460: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
461: // It is basically a vector element-wise multiplication, but cublas does not have it!
462: #if CCCL_VERSION >= 3001000
463: auto multiplies = cuda::std::multiplies<PetscScalar>();
464: #else
465: auto multiplies = thrust::multiplies<PetscScalar>();
466: #endif
467: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));
469: // Solve U X = Y
470: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
471: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
472: } else {
473: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
474: }
475: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
477: // Reorder X with the column permutation if needed, and put the result back to x
478: if (fs->cpermIndices) {
479: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
480: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
481: }
483: PetscCall(VecCUDARestoreArrayRead(b, &barray));
484: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
485: PetscCall(PetscLogGpuTimeEnd());
486: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
487: PetscFunctionReturn(PETSC_SUCCESS);
488: }
490: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
491: {
492: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
493: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
494: IS ip = a->row;
495: PetscBool perm_identity;
496: PetscInt n = A->rmap->n;
498: PetscFunctionBegin;
499: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
501: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
502: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
504: A->offloadmask = PETSC_OFFLOAD_BOTH;
506: /* lower triangular indices */
507: PetscCall(ISIdentity(ip, &perm_identity));
508: if (!perm_identity) {
509: IS iip;
510: const PetscInt *irip, *rip;
512: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
513: PetscCall(ISGetIndices(iip, &irip));
514: PetscCall(ISGetIndices(ip, &rip));
515: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
516: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
517: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
518: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
519: PetscCall(ISRestoreIndices(iip, &irip));
520: PetscCall(ISDestroy(&iip));
521: PetscCall(ISRestoreIndices(ip, &rip));
522: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
523: }
524: PetscFunctionReturn(PETSC_SUCCESS);
525: }
527: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
528: {
529: PetscFunctionBegin;
530: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
531: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
532: B->offloadmask = PETSC_OFFLOAD_CPU;
533: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
534: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
535: B->ops->matsolve = NULL;
536: B->ops->matsolvetranspose = NULL;
537: /* get the triangular factors */
538: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
539: PetscFunctionReturn(PETSC_SUCCESS);
540: }
542: struct PetscScalarToPetscInt {
543: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
544: };
546: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
547: {
548: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
549: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
550: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
551: cusparseStatus_t stat;
552: cusparseIndexBase_t indexBase;
554: PetscFunctionBegin;
555: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
556: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
557: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
558: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
559: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
560: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
561: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
562: PetscCall(PetscLogGpuTimeBegin());
563: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
564: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
565: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
566: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
567: indexBase = cusparseGetMatIndexBase(matstruct->descr);
568: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
569: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
571: /* set alpha and beta */
572: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
573: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
574: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
575: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
576: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
577: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
579: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
580: CsrMatrix *matrixT = new CsrMatrix;
581: matstructT->mat = matrixT;
582: matrixT->num_rows = A->cmap->n;
583: matrixT->num_cols = A->rmap->n;
584: matrixT->num_entries = a->nz;
585: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
586: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
587: matrixT->values = new THRUSTARRAY(a->nz);
589: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
590: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
591: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
592: indexBase, cusparse_scalartype);
593: PetscCallCUSPARSE(stat);
594: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
595: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
596: }
597: }
598: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
599: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
600: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
601: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
602: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
603: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
604: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
605: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
606: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
607: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
608: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
609: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
610: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
611: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
612: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
613: }
614: if (!cusparsestruct->csr2csc_i) {
615: THRUSTARRAY csr2csc_a(matrix->num_entries);
616: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
618: indexBase = cusparseGetMatIndexBase(matstruct->descr);
619: void *csr2cscBuffer;
620: size_t csr2cscBufferSize;
621: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
622: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
623: PetscCallCUSPARSE(stat);
624: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
626: if (matrix->num_entries) {
627: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
628: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
629: I checked every parameters and they were just fine. I have no clue why cusparse complains.
631: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
632: should be filled with indexBase. So I just take a shortcut here.
633: */
634: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
635: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
636: PetscCallCUSPARSE(stat);
637: } else {
638: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
639: }
641: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
642: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
643: PetscCallCUDA(cudaFree(csr2cscBuffer));
644: }
645: PetscCallThrust(
646: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
647: }
648: PetscCall(PetscLogGpuTimeEnd());
649: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
650: /* the compressed row indices is not used for matTranspose */
651: matstructT->cprowIndices = NULL;
652: /* assign the pointer */
653: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
654: A->transupdated = PETSC_TRUE;
655: PetscFunctionReturn(PETSC_SUCCESS);
656: }
658: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
659: {
660: const PetscScalar *barray;
661: PetscScalar *xarray;
662: thrust::device_ptr<const PetscScalar> bGPU;
663: thrust::device_ptr<PetscScalar> xGPU;
664: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
665: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
666: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
667: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
668: PetscInt m = A->rmap->n;
670: PetscFunctionBegin;
671: PetscCall(PetscLogGpuTimeBegin());
672: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
673: PetscCall(VecCUDAGetArrayRead(b, &barray));
674: xGPU = thrust::device_pointer_cast(xarray);
675: bGPU = thrust::device_pointer_cast(barray);
677: // Reorder b with the row permutation if needed, and wrap the result in fs->X
678: if (fs->rpermIndices) {
679: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
680: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
681: } else {
682: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
683: }
685: // Solve L Y = X
686: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
687: // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
688: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
690: // Solve U X = Y
691: if (fs->cpermIndices) {
692: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
693: } else {
694: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
695: }
696: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
698: // Reorder X with the column permutation if needed, and put the result back to x
699: if (fs->cpermIndices) {
700: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
701: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
702: }
703: PetscCall(VecCUDARestoreArrayRead(b, &barray));
704: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
705: PetscCall(PetscLogGpuTimeEnd());
706: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
707: PetscFunctionReturn(PETSC_SUCCESS);
708: }
710: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
711: {
712: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
713: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
714: const PetscScalar *barray;
715: PetscScalar *xarray;
716: thrust::device_ptr<const PetscScalar> bGPU;
717: thrust::device_ptr<PetscScalar> xGPU;
718: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
719: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
720: PetscInt m = A->rmap->n;
722: PetscFunctionBegin;
723: PetscCall(PetscLogGpuTimeBegin());
724: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
725: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
726: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
727: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
729: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
730: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
731: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
732: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
733: fs->createdTransposeSpSVDescr = PETSC_TRUE;
734: }
736: if (!fs->updatedTransposeSpSVAnalysis) {
737: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
739: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
740: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
741: }
743: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
744: PetscCall(VecCUDAGetArrayRead(b, &barray));
745: xGPU = thrust::device_pointer_cast(xarray);
746: bGPU = thrust::device_pointer_cast(barray);
748: // Reorder b with the row permutation if needed, and wrap the result in fs->X
749: if (fs->rpermIndices) {
750: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
751: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
752: } else {
753: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
754: }
756: // Solve Ut Y = X
757: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
758: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
760: // Solve Lt X = Y
761: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
762: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
763: } else {
764: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
765: }
766: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
768: // Reorder X with the column permutation if needed, and put the result back to x
769: if (fs->cpermIndices) {
770: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
771: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
772: }
774: PetscCall(VecCUDARestoreArrayRead(b, &barray));
775: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
776: PetscCall(PetscLogGpuTimeEnd());
777: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
778: PetscFunctionReturn(PETSC_SUCCESS);
779: }
781: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
782: {
783: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
784: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
785: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
786: CsrMatrix *Acsr;
787: PetscInt m, nz;
788: PetscBool flg;
790: PetscFunctionBegin;
791: if (PetscDefined(USE_DEBUG)) {
792: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
793: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
794: }
796: /* Copy A's value to fact */
797: m = fact->rmap->n;
798: nz = aij->nz;
799: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
800: Acsr = (CsrMatrix *)Acusp->mat->mat;
801: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
803: PetscCall(PetscLogGpuTimeBegin());
804: /* Factorize fact inplace */
805: if (m)
806: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
807: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
808: if (PetscDefined(USE_DEBUG)) {
809: int numerical_zero;
810: cusparseStatus_t status;
811: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
812: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
813: }
815: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
816: if (fs->updatedSpSVAnalysis) {
817: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
818: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
819: } else
820: #endif
821: {
822: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
823: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
824: */
825: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
827: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
829: fs->updatedSpSVAnalysis = PETSC_TRUE;
830: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
831: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
832: }
834: fact->offloadmask = PETSC_OFFLOAD_GPU;
835: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
836: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
837: fact->ops->matsolve = NULL;
838: fact->ops->matsolvetranspose = NULL;
839: PetscCall(PetscLogGpuTimeEnd());
840: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
841: PetscFunctionReturn(PETSC_SUCCESS);
842: }
844: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
845: {
846: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
847: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
848: PetscInt m, nz;
850: PetscFunctionBegin;
851: if (PetscDefined(USE_DEBUG)) {
852: PetscBool flg, diagDense;
854: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
855: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
856: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
857: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
858: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
859: }
861: /* Free the old stale stuff */
862: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
864: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
865: but they will not be used. Allocate them just for easy debugging.
866: */
867: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
869: fact->offloadmask = PETSC_OFFLOAD_BOTH;
870: fact->factortype = MAT_FACTOR_ILU;
871: fact->info.factor_mallocs = 0;
872: fact->info.fill_ratio_given = info->fill;
873: fact->info.fill_ratio_needed = 1.0;
875: aij->row = NULL;
876: aij->col = NULL;
878: /* ====================================================================== */
879: /* Copy A's i, j to fact and also allocate the value array of fact. */
880: /* We'll do in-place factorization on fact */
881: /* ====================================================================== */
882: const int *Ai, *Aj;
884: m = fact->rmap->n;
885: nz = aij->nz;
887: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
888: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
889: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
890: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
891: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
892: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
894: /* ====================================================================== */
895: /* Create descriptors for M, L, U */
896: /* ====================================================================== */
897: cusparseFillMode_t fillMode;
898: cusparseDiagType_t diagType;
900: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
901: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
902: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
904: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
905: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
906: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
907: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
908: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
909: */
910: fillMode = CUSPARSE_FILL_MODE_LOWER;
911: diagType = CUSPARSE_DIAG_TYPE_UNIT;
912: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
913: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
914: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
916: fillMode = CUSPARSE_FILL_MODE_UPPER;
917: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
918: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
919: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
920: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
922: /* ========================================================================= */
923: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
924: /* ========================================================================= */
925: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
926: if (m)
927: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
928: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
930: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
931: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
933: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
934: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
936: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
937: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
939: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
940: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
942: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
943: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
944: spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
945: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
946: */
947: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
948: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
949: fs->spsvBuffer_L = fs->factBuffer_M;
950: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
951: } else {
952: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
953: fs->spsvBuffer_U = fs->factBuffer_M;
954: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
955: }
957: /* ========================================================================== */
958: /* Perform analysis of ilu0 on M, SpSv on L and U */
959: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
960: /* ========================================================================== */
961: int structural_zero;
962: cusparseStatus_t status;
964: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
965: if (m)
966: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
967: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
968: if (PetscDefined(USE_DEBUG)) {
969: /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
970: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
971: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
972: }
974: /* Estimate FLOPs of the numeric factorization */
975: {
976: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
977: PetscInt *Ai, nzRow, nzLeft;
978: const PetscInt *adiag;
979: PetscLogDouble flops = 0.0;
981: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
982: Ai = Aseq->i;
983: for (PetscInt i = 0; i < m; i++) {
984: if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
985: nzRow = Ai[i + 1] - Ai[i];
986: nzLeft = adiag[i] - Ai[i];
987: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
988: and include the eliminated one will be updated, which incurs a multiplication and an addition.
989: */
990: nzLeft = (nzRow - 1) / 2;
991: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
992: }
993: }
994: fs->numericFactFlops = flops;
995: }
996: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
997: PetscFunctionReturn(PETSC_SUCCESS);
998: }
1000: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1001: {
1002: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1003: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1004: const PetscScalar *barray;
1005: PetscScalar *xarray;
1007: PetscFunctionBegin;
1008: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1009: PetscCall(VecCUDAGetArrayRead(b, &barray));
1010: PetscCall(PetscLogGpuTimeBegin());
1012: /* Solve L*y = b */
1013: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1014: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1015: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1016: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1018: /* Solve Lt*x = y */
1019: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1020: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1021: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1023: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1024: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1026: PetscCall(PetscLogGpuTimeEnd());
1027: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1028: PetscFunctionReturn(PETSC_SUCCESS);
1029: }
1031: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1032: {
1033: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1034: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1035: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1036: CsrMatrix *Acsr;
1037: PetscInt m, nz;
1038: PetscBool flg;
1040: PetscFunctionBegin;
1041: if (PetscDefined(USE_DEBUG)) {
1042: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1043: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1044: }
1046: /* Copy A's value to fact */
1047: m = fact->rmap->n;
1048: nz = aij->nz;
1049: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1050: Acsr = (CsrMatrix *)Acusp->mat->mat;
1051: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1053: /* Factorize fact inplace */
1054: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1055: csric02() only takes the lower triangular part of matrix A to perform factorization.
1056: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1057: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1058: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1059: */
1060: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1061: if (PetscDefined(USE_DEBUG)) {
1062: int numerical_zero;
1063: cusparseStatus_t status;
1064: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1065: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1066: }
1068: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1069: if (fs->updatedSpSVAnalysis) {
1070: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1071: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1072: } else
1073: #endif
1074: {
1075: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1077: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1078: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1079: */
1080: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1081: fs->updatedSpSVAnalysis = PETSC_TRUE;
1082: }
1084: fact->offloadmask = PETSC_OFFLOAD_GPU;
1085: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1086: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1087: fact->ops->matsolve = NULL;
1088: fact->ops->matsolvetranspose = NULL;
1089: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1090: PetscFunctionReturn(PETSC_SUCCESS);
1091: }
1093: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1094: {
1095: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1096: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1097: PetscInt m, nz;
1099: PetscFunctionBegin;
1100: if (PetscDefined(USE_DEBUG)) {
1101: PetscBool flg, diagDense;
1103: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1104: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1105: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1106: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1107: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1108: }
1110: /* Free the old stale stuff */
1111: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1113: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1114: but they will not be used. Allocate them just for easy debugging.
1115: */
1116: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1118: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1119: fact->factortype = MAT_FACTOR_ICC;
1120: fact->info.factor_mallocs = 0;
1121: fact->info.fill_ratio_given = info->fill;
1122: fact->info.fill_ratio_needed = 1.0;
1124: aij->row = NULL;
1125: aij->col = NULL;
1127: /* ====================================================================== */
1128: /* Copy A's i, j to fact and also allocate the value array of fact. */
1129: /* We'll do in-place factorization on fact */
1130: /* ====================================================================== */
1131: const int *Ai, *Aj;
1133: m = fact->rmap->n;
1134: nz = aij->nz;
1136: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1137: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1138: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1139: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1140: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1141: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1143: /* ====================================================================== */
1144: /* Create mat descriptors for M, L */
1145: /* ====================================================================== */
1146: cusparseFillMode_t fillMode;
1147: cusparseDiagType_t diagType;
1149: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1150: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1151: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1153: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1154: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1155: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1156: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1157: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1158: */
1159: fillMode = CUSPARSE_FILL_MODE_LOWER;
1160: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1161: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1162: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1163: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1165: /* ========================================================================= */
1166: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1167: /* ========================================================================= */
1168: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1169: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
1171: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1172: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1174: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1175: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1177: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1178: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1180: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1181: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1183: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1184: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1185: */
1186: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1187: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1188: fs->spsvBuffer_L = fs->factBuffer_M;
1189: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1190: } else {
1191: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1192: fs->spsvBuffer_Lt = fs->factBuffer_M;
1193: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1194: }
1196: /* ========================================================================== */
1197: /* Perform analysis of ic0 on M */
1198: /* The lower triangular part of M has the same sparsity pattern as L */
1199: /* ========================================================================== */
1200: int structural_zero;
1201: cusparseStatus_t status;
1203: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1204: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1205: if (PetscDefined(USE_DEBUG)) {
1206: /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1207: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1208: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1209: }
1211: /* Estimate FLOPs of the numeric factorization */
1212: {
1213: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1214: PetscInt *Ai, nzRow, nzLeft;
1215: PetscLogDouble flops = 0.0;
1217: Ai = Aseq->i;
1218: for (PetscInt i = 0; i < m; i++) {
1219: nzRow = Ai[i + 1] - Ai[i];
1220: if (nzRow > 1) {
1221: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1222: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1223: */
1224: nzLeft = (nzRow - 1) / 2;
1225: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1226: }
1227: }
1228: fs->numericFactFlops = flops;
1229: }
1230: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1231: PetscFunctionReturn(PETSC_SUCCESS);
1232: }
1234: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1235: {
1236: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
1237: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1239: PetscFunctionBegin;
1240: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1241: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1242: B->offloadmask = PETSC_OFFLOAD_CPU;
1244: if (!cusparsestruct->use_cpu_solve) {
1245: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
1246: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1247: }
1248: B->ops->matsolve = NULL;
1249: B->ops->matsolvetranspose = NULL;
1251: /* get the triangular factors */
1252: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
1253: PetscFunctionReturn(PETSC_SUCCESS);
1254: }
1256: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1257: {
1258: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
1260: PetscFunctionBegin;
1261: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1262: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1263: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1264: PetscFunctionReturn(PETSC_SUCCESS);
1265: }
1267: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1268: {
1269: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1271: PetscFunctionBegin;
1272: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1273: if (!info->factoronhost) {
1274: PetscCall(ISIdentity(isrow, &row_identity));
1275: PetscCall(ISIdentity(iscol, &col_identity));
1276: }
1277: if (!info->levels && row_identity && col_identity) {
1278: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1279: } else {
1280: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1281: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1282: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1283: }
1284: PetscFunctionReturn(PETSC_SUCCESS);
1285: }
1287: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1288: {
1289: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1291: PetscFunctionBegin;
1292: PetscBool perm_identity = PETSC_FALSE;
1293: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1294: if (!info->levels && perm_identity) {
1295: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1296: } else {
1297: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1298: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1299: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1300: }
1301: PetscFunctionReturn(PETSC_SUCCESS);
1302: }
1304: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1305: {
1306: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1308: PetscFunctionBegin;
1309: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1310: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1311: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1312: PetscFunctionReturn(PETSC_SUCCESS);
1313: }
1315: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1316: {
1317: PetscFunctionBegin;
1318: *type = MATSOLVERCUSPARSE;
1319: PetscFunctionReturn(PETSC_SUCCESS);
1320: }
1322: /*MC
1323: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1324: on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1325: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1326: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1327: CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1328: algorithms are not recommended. This class does NOT support direct solver operations.
1330: Level: beginner
1332: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1333: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1334: M*/
1336: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1337: {
1338: PetscInt n = A->rmap->n;
1340: PetscFunctionBegin;
1341: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1342: PetscCall(MatSetSizes(*B, n, n, n, n));
1343: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
1344: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
1346: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1347: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1348: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1349: if (!A->boundtocpu) {
1350: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1351: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1352: } else {
1353: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1354: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1355: }
1356: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1357: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1358: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1359: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1360: if (!A->boundtocpu) {
1361: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1362: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1363: } else {
1364: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1365: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1366: }
1367: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1368: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1369: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
1371: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1372: (*B)->canuseordering = PETSC_TRUE;
1373: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
1374: PetscFunctionReturn(PETSC_SUCCESS);
1375: }
1377: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1378: {
1379: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1380: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1381: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1383: PetscFunctionBegin;
1384: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1385: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1386: if (A->factortype == MAT_FACTOR_NONE) {
1387: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1388: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1389: } else if (fs->csrVal) {
1390: /* We have a factorized matrix on device and are able to copy it to host */
1391: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1392: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1393: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1394: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1395: A->offloadmask = PETSC_OFFLOAD_BOTH;
1396: }
1397: PetscFunctionReturn(PETSC_SUCCESS);
1398: }
1400: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (CUDA specialisation) */
1401: struct MatSeqAIJCUSPARSE_Policy {
1402: typedef Mat_SeqAIJCUSPARSE mat_struct_type;
1403: typedef Mat_SeqAIJCUSPARSEMultStruct mult_struct_type;
1405: static int storage_format_csr() { return (int)MAT_CUSPARSE_CSR; }
1406: static int storage_format_ell() { return (int)MAT_CUSPARSE_ELL; }
1407: static int storage_format_hyb() { return (int)MAT_CUSPARSE_HYB; }
1409: static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
1410: static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJCUSPARSECopyFromGPU(A); }
1411: static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJCUSPARSEInvalidateTranspose(A, d); }
1412: static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, t, r, C); }
1413: static const char *mat_type_name;
1415: static PetscErrorCode Destroy(Mat A) { return MatSeqAIJCUSPARSE_Destroy(A); }
1416: static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)spptr); }
1417: static const char *set_format_c;
1418: static const char *set_use_cpu_solve_c;
1419: static const char *product_seqdense_device_c;
1420: static const char *product_seqdense_c;
1421: static const char *product_self_c;
1422: static const char *seq_convert_hypre_c;
1424: static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecCUDAGetArrayRead(v, a); }
1425: static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecCUDARestoreArrayRead(v, a); }
1426: static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecCUDAGetArrayWrite(v, a); }
1427: static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecCUDARestoreArrayWrite(v, a); }
1428: };
1429: const char *MatSeqAIJCUSPARSE_Policy::mat_type_name = MATSEQAIJCUSPARSE;
1430: const char *MatSeqAIJCUSPARSE_Policy::set_format_c = "MatCUSPARSESetFormat_C";
1431: const char *MatSeqAIJCUSPARSE_Policy::set_use_cpu_solve_c = "MatCUSPARSESetUseCPUSolve_C";
1432: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C";
1433: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_c = "MatProductSetFromOptions_seqaijcusparse_seqdense_C";
1434: const char *MatSeqAIJCUSPARSE_Policy::product_self_c = "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C";
1435: const char *MatSeqAIJCUSPARSE_Policy::seq_convert_hypre_c = "MatConvert_seqaijcusparse_hypre_C";
1437: using MatSeqAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatSeqAIJCUSPARSE_Policy>;
1439: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1440: {
1441: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1442: }
1444: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1445: {
1446: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1447: }
1449: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1450: {
1451: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1452: }
1454: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1455: {
1456: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1457: }
1459: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1460: {
1461: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1462: }
1464: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1465: {
1466: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1467: }
1469: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1470: {
1471: Mat_SeqAIJCUSPARSE *cusp;
1472: CsrMatrix *matrix;
1474: PetscFunctionBegin;
1475: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1476: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1477: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1478: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1479: matrix = (CsrMatrix *)cusp->mat->mat;
1481: if (i) {
1482: #if !defined(PETSC_USE_64BIT_INDICES)
1483: *i = matrix->row_offsets->data().get();
1484: #else
1485: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
1486: #endif
1487: }
1488: if (j) {
1489: #if !defined(PETSC_USE_64BIT_INDICES)
1490: *j = matrix->column_indices->data().get();
1491: #else
1492: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
1493: #endif
1494: }
1495: if (a) *a = matrix->values->data().get();
1496: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1497: PetscFunctionReturn(PETSC_SUCCESS);
1498: }
1500: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1501: {
1502: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1503: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1504: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1505: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
1506: cusparseStatus_t stat;
1507: PetscBool both = PETSC_TRUE;
1509: PetscFunctionBegin;
1510: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1511: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1512: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1513: CsrMatrix *matrix;
1514: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
1516: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1517: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1518: matrix->values->assign(a->a, a->a + a->nz);
1519: PetscCallCUDA(WaitForCUDA());
1520: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1521: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1522: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
1523: } else {
1524: PetscInt nnz;
1525: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1526: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
1527: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1528: delete cusparsestruct->workVector;
1529: delete cusparsestruct->rowoffsets_gpu;
1530: cusparsestruct->workVector = NULL;
1531: cusparsestruct->rowoffsets_gpu = NULL;
1532: try {
1533: if (a->compressedrow.use) {
1534: m = a->compressedrow.nrows;
1535: ii = a->compressedrow.i;
1536: ridx = a->compressedrow.rindex;
1537: } else {
1538: m = A->rmap->n;
1539: ii = a->i;
1540: ridx = NULL;
1541: }
1542: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1543: if (!a->a) {
1544: nnz = ii[m];
1545: both = PETSC_FALSE;
1546: } else nnz = a->nz;
1547: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
1549: /* create cusparse matrix */
1550: cusparsestruct->nrows = m;
1551: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1552: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
1553: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
1554: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1556: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1557: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1558: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1559: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1560: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1561: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1562: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
1564: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1565: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1566: /* set the matrix */
1567: CsrMatrix *mat = new CsrMatrix;
1568: mat->num_rows = m;
1569: mat->num_cols = A->cmap->n;
1570: mat->num_entries = nnz;
1571: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
1572: mat->row_offsets->assign(ii, ii + m + 1);
1573: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
1574: mat->column_indices->assign(a->j, a->j + nnz);
1576: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1577: if (a->a) mat->values->assign(a->a, a->a + nnz);
1579: /* assign the pointer */
1580: matstruct->mat = mat;
1581: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1582: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1583: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1584: PetscCallCUSPARSE(stat);
1585: }
1586: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1587: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1588: }
1590: /* assign the compressed row indices */
1591: if (a->compressedrow.use) {
1592: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
1593: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1594: matstruct->cprowIndices->assign(ridx, ridx + m);
1595: tmp = m;
1596: } else {
1597: cusparsestruct->workVector = NULL;
1598: matstruct->cprowIndices = NULL;
1599: tmp = 0;
1600: }
1601: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
1603: /* assign the pointer */
1604: cusparsestruct->mat = matstruct;
1605: } catch (char *ex) {
1606: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
1607: }
1608: PetscCallCUDA(WaitForCUDA());
1609: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1610: cusparsestruct->nonzerostate = A->nonzerostate;
1611: }
1612: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1613: }
1614: PetscFunctionReturn(PETSC_SUCCESS);
1615: }
1617: struct VecCUDAPlusEquals {
1618: template <typename Tuple>
1619: __host__ __device__ void operator()(Tuple t)
1620: {
1621: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1622: }
1623: };
1625: struct VecCUDAEquals {
1626: template <typename Tuple>
1627: __host__ __device__ void operator()(Tuple t)
1628: {
1629: thrust::get<1>(t) = thrust::get<0>(t);
1630: }
1631: };
1633: struct VecCUDAEqualsReverse {
1634: template <typename Tuple>
1635: __host__ __device__ void operator()(Tuple t)
1636: {
1637: thrust::get<0>(t) = thrust::get<1>(t);
1638: }
1639: };
1641: struct MatProductCtx_MatMatCusparse {
1642: PetscBool cisdense;
1643: PetscScalar *Bt;
1644: Mat X;
1645: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1646: PetscLogDouble flops;
1647: CsrMatrix *Bcsr;
1649: cusparseSpMatDescr_t matSpBDescr;
1650: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1651: cusparseDnMatDescr_t matBDescr;
1652: cusparseDnMatDescr_t matCDescr;
1653: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1654: void *dBuffer4;
1655: void *dBuffer5;
1656: size_t mmBufferSize;
1657: void *mmBuffer;
1658: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1659: cusparseSpGEMMDescr_t spgemmDesc;
1660: };
1662: static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
1663: {
1664: MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;
1666: PetscFunctionBegin;
1667: PetscCallCUDA(cudaFree(mmdata->Bt));
1668: delete mmdata->Bcsr;
1669: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
1670: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1671: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1672: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1673: PetscCallCUDA(cudaFree(mmdata->dBuffer4));
1674: PetscCallCUDA(cudaFree(mmdata->dBuffer5));
1675: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1676: PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
1677: PetscCall(MatDestroy(&mmdata->X));
1678: PetscCall(PetscFree(mmdata));
1679: PetscFunctionReturn(PETSC_SUCCESS);
1680: }
1682: #include <../src/mat/impls/dense/seq/dense.h>
1684: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1685: {
1686: Mat_Product *product = C->product;
1687: Mat A, B;
1688: PetscInt m, n, blda, clda;
1689: PetscBool flg, biscuda;
1690: Mat_SeqAIJCUSPARSE *cusp;
1691: cusparseStatus_t stat;
1692: cusparseOperation_t opA;
1693: const PetscScalar *barray;
1694: PetscScalar *carray;
1695: MatProductCtx_MatMatCusparse *mmdata;
1696: Mat_SeqAIJCUSPARSEMultStruct *mat;
1697: CsrMatrix *csrmat;
1699: PetscFunctionBegin;
1700: MatCheckProduct(C, 1);
1701: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1702: mmdata = (MatProductCtx_MatMatCusparse *)product->data;
1703: A = product->A;
1704: B = product->B;
1705: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1706: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1707: /* currently CopyToGpu does not copy if the matrix is bound to CPU
1708: Instead of silently accepting the wrong answer, I prefer to raise the error */
1709: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1710: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1711: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1712: switch (product->type) {
1713: case MATPRODUCT_AB:
1714: case MATPRODUCT_PtAP:
1715: mat = cusp->mat;
1716: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1717: m = A->rmap->n;
1718: n = B->cmap->n;
1719: break;
1720: case MATPRODUCT_AtB:
1721: if (!A->form_explicit_transpose) {
1722: mat = cusp->mat;
1723: opA = CUSPARSE_OPERATION_TRANSPOSE;
1724: } else {
1725: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1726: mat = cusp->matTranspose;
1727: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1728: }
1729: m = A->cmap->n;
1730: n = B->cmap->n;
1731: break;
1732: case MATPRODUCT_ABt:
1733: case MATPRODUCT_RARt:
1734: mat = cusp->mat;
1735: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1736: m = A->rmap->n;
1737: n = B->rmap->n;
1738: break;
1739: default:
1740: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1741: }
1742: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
1743: csrmat = (CsrMatrix *)mat->mat;
1744: /* if the user passed a CPU matrix, copy the data to the GPU */
1745: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
1746: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1747: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
1749: PetscCall(MatDenseGetLDA(B, &blda));
1750: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1751: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1752: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1753: } else {
1754: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1755: PetscCall(MatDenseGetLDA(C, &clda));
1756: }
1758: PetscCall(PetscLogGpuTimeBegin());
1759: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1760: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
1761: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
1762: #else
1763: cusparseSpMatDescr_t &matADescr = mat->matDescr;
1764: #endif
1766: /* (re)allocate mmBuffer if not initialized or LDAs are different */
1767: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1768: size_t mmBufferSize;
1769: if (mmdata->initialized && mmdata->Blda != blda) {
1770: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1771: mmdata->matBDescr = NULL;
1772: }
1773: if (!mmdata->matBDescr) {
1774: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1775: mmdata->Blda = blda;
1776: }
1778: if (mmdata->initialized && mmdata->Clda != clda) {
1779: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1780: mmdata->matCDescr = NULL;
1781: }
1782: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1783: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1784: mmdata->Clda = clda;
1785: }
1787: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
1788: if (matADescr) {
1789: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
1790: matADescr = NULL;
1791: }
1792: #endif
1794: if (!matADescr) {
1795: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1796: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1797: PetscCallCUSPARSE(stat);
1798: }
1800: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
1802: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1803: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1804: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
1805: mmdata->mmBufferSize = mmBufferSize;
1806: }
1808: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
1809: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1810: #endif
1812: mmdata->initialized = PETSC_TRUE;
1813: } else {
1814: /* to be safe, always update pointers of the mats */
1815: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
1816: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1817: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1818: }
1820: /* do cusparseSpMM, which supports transpose on B */
1821: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1823: PetscCall(PetscLogGpuTimeEnd());
1824: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1825: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1826: if (product->type == MATPRODUCT_RARt) {
1827: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1828: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1829: } else if (product->type == MATPRODUCT_PtAP) {
1830: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1831: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1832: } else {
1833: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1834: }
1835: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1836: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1837: PetscFunctionReturn(PETSC_SUCCESS);
1838: }
1840: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1841: {
1842: Mat_Product *product = C->product;
1843: Mat A, B;
1844: PetscInt m, n;
1845: PetscBool cisdense, flg;
1846: MatProductCtx_MatMatCusparse *mmdata;
1847: Mat_SeqAIJCUSPARSE *cusp;
1849: PetscFunctionBegin;
1850: MatCheckProduct(C, 1);
1851: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1852: A = product->A;
1853: B = product->B;
1854: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1855: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1856: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1857: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1858: switch (product->type) {
1859: case MATPRODUCT_AB:
1860: m = A->rmap->n;
1861: n = B->cmap->n;
1862: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1863: break;
1864: case MATPRODUCT_AtB:
1865: m = A->cmap->n;
1866: n = B->cmap->n;
1867: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
1868: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1869: break;
1870: case MATPRODUCT_ABt:
1871: m = A->rmap->n;
1872: n = B->rmap->n;
1873: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
1874: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1875: break;
1876: case MATPRODUCT_PtAP:
1877: m = B->cmap->n;
1878: n = B->cmap->n;
1879: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
1880: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1881: break;
1882: case MATPRODUCT_RARt:
1883: m = B->rmap->n;
1884: n = B->rmap->n;
1885: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
1886: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1887: break;
1888: default:
1889: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1890: }
1891: PetscCall(MatSetSizes(C, m, n, m, n));
1892: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1893: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1894: PetscCall(MatSetType(C, MATSEQDENSECUDA));
1896: /* product data */
1897: PetscCall(PetscNew(&mmdata));
1898: mmdata->cisdense = cisdense;
1899: /* for these products we need intermediate storage */
1900: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1901: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1902: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
1903: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1904: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1905: } else {
1906: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1907: }
1908: }
1909: C->product->data = mmdata;
1910: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
1912: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1913: PetscFunctionReturn(PETSC_SUCCESS);
1914: }
1916: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1917: {
1918: Mat_Product *product = C->product;
1919: Mat A, B;
1920: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
1921: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1922: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1923: CsrMatrix *Acsr, *Bcsr, *Ccsr;
1924: PetscBool flg;
1925: cusparseStatus_t stat;
1926: MatProductType ptype;
1927: MatProductCtx_MatMatCusparse *mmdata;
1928: cusparseSpMatDescr_t BmatSpDescr;
1929: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
1931: PetscFunctionBegin;
1932: MatCheckProduct(C, 1);
1933: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1934: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
1935: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1936: mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
1937: A = product->A;
1938: B = product->B;
1939: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1940: mmdata->reusesym = PETSC_FALSE;
1941: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1942: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1943: Cmat = Ccusp->mat;
1944: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1945: Ccsr = (CsrMatrix *)Cmat->mat;
1946: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1947: goto finalize;
1948: }
1949: if (!c->nz) goto finalize;
1950: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1951: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1952: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
1953: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1954: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1955: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1956: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1957: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
1958: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1959: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1960: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1961: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1962: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1963: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
1965: ptype = product->type;
1966: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1967: ptype = MATPRODUCT_AB;
1968: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1969: }
1970: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1971: ptype = MATPRODUCT_AB;
1972: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1973: }
1974: switch (ptype) {
1975: case MATPRODUCT_AB:
1976: Amat = Acusp->mat;
1977: Bmat = Bcusp->mat;
1978: break;
1979: case MATPRODUCT_AtB:
1980: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1981: Amat = Acusp->matTranspose;
1982: Bmat = Bcusp->mat;
1983: break;
1984: case MATPRODUCT_ABt:
1985: Amat = Acusp->mat;
1986: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
1987: Bmat = Bcusp->matTranspose;
1988: break;
1989: default:
1990: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1991: }
1992: Cmat = Ccusp->mat;
1993: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1994: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1995: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1996: Acsr = (CsrMatrix *)Amat->mat;
1997: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1998: Ccsr = (CsrMatrix *)Cmat->mat;
1999: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2000: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2001: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2002: PetscCall(PetscLogGpuTimeBegin());
2003: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2004: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2005: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2006: PetscCallCUSPARSE(stat);
2007: PetscCall(PetscLogGpuFlops(mmdata->flops));
2008: PetscCallCUDA(WaitForCUDA());
2009: PetscCall(PetscLogGpuTimeEnd());
2010: C->offloadmask = PETSC_OFFLOAD_GPU;
2011: finalize:
2012: /* shorter version of MatAssemblyEnd_SeqAIJ */
2013: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
2014: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
2015: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2016: c->reallocs = 0;
2017: C->info.mallocs += 0;
2018: C->info.nz_unneeded = 0;
2019: C->assembled = C->was_assembled = PETSC_TRUE;
2020: C->num_ass++;
2021: PetscFunctionReturn(PETSC_SUCCESS);
2022: }
2024: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2025: {
2026: Mat_Product *product = C->product;
2027: Mat A, B;
2028: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2029: Mat_SeqAIJ *a, *b, *c;
2030: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2031: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2032: PetscInt i, j, m, n, k;
2033: PetscBool flg;
2034: cusparseStatus_t stat;
2035: MatProductType ptype;
2036: MatProductCtx_MatMatCusparse *mmdata;
2037: PetscLogDouble flops;
2038: PetscBool biscompressed, ciscompressed;
2039: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2040: cusparseSpMatDescr_t BmatSpDescr;
2041: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2042: void *dBuffer1 = NULL, *dBuffer2 = NULL, *dBuffer3 = NULL;
2043: size_t bufferSize1 = 0, bufferSize2 = 0, bufferSize3 = 0, bufferSize4 = 0, bufferSize5 = 0;
2045: PetscFunctionBegin;
2046: MatCheckProduct(C, 1);
2047: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2048: A = product->A;
2049: B = product->B;
2050: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2051: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2052: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2053: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2054: a = (Mat_SeqAIJ *)A->data;
2055: b = (Mat_SeqAIJ *)B->data;
2056: /* product data */
2057: PetscCall(PetscNew(&mmdata));
2058: C->product->data = mmdata;
2059: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2061: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2062: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2063: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2064: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2065: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2066: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2068: ptype = product->type;
2069: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2070: ptype = MATPRODUCT_AB;
2071: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2072: }
2073: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2074: ptype = MATPRODUCT_AB;
2075: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2076: }
2077: biscompressed = PETSC_FALSE;
2078: ciscompressed = PETSC_FALSE;
2079: switch (ptype) {
2080: case MATPRODUCT_AB:
2081: m = A->rmap->n;
2082: n = B->cmap->n;
2083: k = A->cmap->n;
2084: Amat = Acusp->mat;
2085: Bmat = Bcusp->mat;
2086: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2087: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2088: break;
2089: case MATPRODUCT_AtB:
2090: m = A->cmap->n;
2091: n = B->cmap->n;
2092: k = A->rmap->n;
2093: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2094: Amat = Acusp->matTranspose;
2095: Bmat = Bcusp->mat;
2096: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2097: break;
2098: case MATPRODUCT_ABt:
2099: m = A->rmap->n;
2100: n = B->rmap->n;
2101: k = A->cmap->n;
2102: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2103: Amat = Acusp->mat;
2104: Bmat = Bcusp->matTranspose;
2105: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2106: break;
2107: default:
2108: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2109: }
2111: /* create cusparse matrix */
2112: PetscCall(MatSetSizes(C, m, n, m, n));
2113: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2114: c = (Mat_SeqAIJ *)C->data;
2115: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2116: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2117: Ccsr = new CsrMatrix;
2119: c->compressedrow.use = ciscompressed;
2120: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2121: c->compressedrow.nrows = a->compressedrow.nrows;
2122: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2123: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2124: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2125: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2126: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2127: } else {
2128: c->compressedrow.nrows = 0;
2129: c->compressedrow.i = NULL;
2130: c->compressedrow.rindex = NULL;
2131: Ccusp->workVector = NULL;
2132: Cmat->cprowIndices = NULL;
2133: }
2134: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2135: Ccusp->mat = Cmat;
2136: Ccusp->mat->mat = Ccsr;
2137: Ccsr->num_rows = Ccusp->nrows;
2138: Ccsr->num_cols = n;
2139: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2140: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2141: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2142: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2143: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2144: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2145: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2146: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2147: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2148: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2149: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2150: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2151: c->nz = 0;
2152: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2153: Ccsr->values = new THRUSTARRAY(c->nz);
2154: goto finalizesym;
2155: }
2157: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2158: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2159: Acsr = (CsrMatrix *)Amat->mat;
2160: if (!biscompressed) {
2161: Bcsr = (CsrMatrix *)Bmat->mat;
2162: BmatSpDescr = Bmat->matDescr;
2163: } else { /* we need to use row offsets for the full matrix */
2164: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2165: Bcsr = new CsrMatrix;
2166: Bcsr->num_rows = B->rmap->n;
2167: Bcsr->num_cols = cBcsr->num_cols;
2168: Bcsr->num_entries = cBcsr->num_entries;
2169: Bcsr->column_indices = cBcsr->column_indices;
2170: Bcsr->values = cBcsr->values;
2171: if (!Bcusp->rowoffsets_gpu) {
2172: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2173: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2174: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2175: }
2176: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2177: mmdata->Bcsr = Bcsr;
2178: if (Bcsr->num_rows && Bcsr->num_cols) {
2179: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2180: PetscCallCUSPARSE(stat);
2181: }
2182: BmatSpDescr = mmdata->matSpBDescr;
2183: }
2184: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2185: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2186: /* precompute flops count */
2187: if (ptype == MATPRODUCT_AB) {
2188: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2189: const PetscInt st = a->i[i];
2190: const PetscInt en = a->i[i + 1];
2191: for (j = st; j < en; j++) {
2192: const PetscInt brow = a->j[j];
2193: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2194: }
2195: }
2196: } else if (ptype == MATPRODUCT_AtB) {
2197: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2198: const PetscInt anzi = a->i[i + 1] - a->i[i];
2199: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2200: flops += (2. * anzi) * bnzi;
2201: }
2202: } else { /* TODO */
2203: flops = 0.;
2204: }
2206: mmdata->flops = flops;
2207: PetscCall(PetscLogGpuTimeBegin());
2209: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2210: // cuda-12.2 requires non-null csrRowOffsets
2211: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2212: PetscCallCUSPARSE(stat);
2213: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2214: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2215: dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata
2216: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2217: */
2218: /* ask bufferSize1 bytes for external memory */
2219: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2220: PetscCallCUSPARSE(stat);
2221: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2222: /* inspect the matrices A and B to understand the memory requirement for the next step */
2223: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2224: PetscCallCUSPARSE(stat);
2226: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2227: PetscCallCUSPARSE(stat);
2228: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
2229: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
2230: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
2231: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2232: PetscCallCUSPARSE(stat);
2233: PetscCallCUDA(cudaFree(dBuffer1));
2234: PetscCallCUDA(cudaFree(dBuffer2));
2236: /* get matrix C non-zero entries C_nnz1 */
2237: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2238: c->nz = (PetscInt)C_nnz1;
2239: /* allocate matrix C */
2240: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2241: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2242: Ccsr->values = new THRUSTARRAY(c->nz);
2243: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2244: /* update matC with the new pointers */
2245: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2246: PetscCallCUSPARSE(stat);
2248: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2249: PetscCallCUSPARSE(stat);
2250: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
2251: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2252: PetscCallCUSPARSE(stat);
2253: PetscCallCUDA(cudaFree(dBuffer3));
2254: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2255: PetscCallCUSPARSE(stat);
2256: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2257: PetscCall(PetscLogGpuFlops(mmdata->flops));
2258: PetscCall(PetscLogGpuTimeEnd());
2259: finalizesym:
2260: c->free_a = PETSC_TRUE;
2261: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2262: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2263: c->free_ij = PETSC_TRUE;
2264: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
2265: PetscInt *d_i = c->i;
2266: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2267: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2268: ii = *Ccsr->row_offsets;
2269: jj = *Ccsr->column_indices;
2270: if (ciscompressed) d_i = c->compressedrow.i;
2271: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2272: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2273: } else {
2274: PetscInt *d_i = c->i;
2275: if (ciscompressed) d_i = c->compressedrow.i;
2276: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2277: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2278: }
2279: if (ciscompressed) { /* need to expand host row offsets */
2280: PetscInt r = 0;
2281: c->i[0] = 0;
2282: for (k = 0; k < c->compressedrow.nrows; k++) {
2283: const PetscInt next = c->compressedrow.rindex[k];
2284: const PetscInt old = c->compressedrow.i[k];
2285: for (; r < next; r++) c->i[r + 1] = old;
2286: }
2287: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2288: }
2289: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2290: PetscCall(PetscMalloc1(m, &c->ilen));
2291: PetscCall(PetscMalloc1(m, &c->imax));
2292: c->maxnz = c->nz;
2293: c->nonzerorowcnt = 0;
2294: c->rmax = 0;
2295: for (k = 0; k < m; k++) {
2296: const PetscInt nn = c->i[k + 1] - c->i[k];
2297: c->ilen[k] = c->imax[k] = nn;
2298: c->nonzerorowcnt += (PetscInt)!!nn;
2299: c->rmax = PetscMax(c->rmax, nn);
2300: }
2301: PetscCall(PetscMalloc1(c->nz, &c->a));
2302: Ccsr->num_entries = c->nz;
2304: C->nonzerostate++;
2305: PetscCall(PetscLayoutSetUp(C->rmap));
2306: PetscCall(PetscLayoutSetUp(C->cmap));
2307: Ccusp->nonzerostate = C->nonzerostate;
2308: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2309: C->preallocated = PETSC_TRUE;
2310: C->assembled = PETSC_FALSE;
2311: C->was_assembled = PETSC_FALSE;
2312: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2313: mmdata->reusesym = PETSC_TRUE;
2314: C->offloadmask = PETSC_OFFLOAD_GPU;
2315: }
2316: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2317: PetscFunctionReturn(PETSC_SUCCESS);
2318: }
2320: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2322: /* handles sparse or dense B */
2323: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2324: {
2325: Mat_Product *product = mat->product;
2326: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2328: PetscFunctionBegin;
2329: MatCheckProduct(mat, 1);
2330: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2331: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
2332: if (product->type == MATPRODUCT_ABC) {
2333: Ciscusp = PETSC_FALSE;
2334: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
2335: }
2336: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2337: PetscBool usecpu = PETSC_FALSE;
2338: switch (product->type) {
2339: case MATPRODUCT_AB:
2340: if (product->api_user) {
2341: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2342: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2343: PetscOptionsEnd();
2344: } else {
2345: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2346: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2347: PetscOptionsEnd();
2348: }
2349: break;
2350: case MATPRODUCT_AtB:
2351: if (product->api_user) {
2352: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2353: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2354: PetscOptionsEnd();
2355: } else {
2356: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2357: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2358: PetscOptionsEnd();
2359: }
2360: break;
2361: case MATPRODUCT_PtAP:
2362: if (product->api_user) {
2363: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2364: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2365: PetscOptionsEnd();
2366: } else {
2367: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2368: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2369: PetscOptionsEnd();
2370: }
2371: break;
2372: case MATPRODUCT_RARt:
2373: if (product->api_user) {
2374: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2375: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2376: PetscOptionsEnd();
2377: } else {
2378: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2379: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2380: PetscOptionsEnd();
2381: }
2382: break;
2383: case MATPRODUCT_ABC:
2384: if (product->api_user) {
2385: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2386: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2387: PetscOptionsEnd();
2388: } else {
2389: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2390: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2391: PetscOptionsEnd();
2392: }
2393: break;
2394: default:
2395: break;
2396: }
2397: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2398: }
2399: /* dispatch */
2400: if (isdense) {
2401: switch (product->type) {
2402: case MATPRODUCT_AB:
2403: case MATPRODUCT_AtB:
2404: case MATPRODUCT_ABt:
2405: case MATPRODUCT_PtAP:
2406: case MATPRODUCT_RARt:
2407: if (product->A->boundtocpu) {
2408: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2409: } else {
2410: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2411: }
2412: break;
2413: case MATPRODUCT_ABC:
2414: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2415: break;
2416: default:
2417: break;
2418: }
2419: } else if (Biscusp && Ciscusp) {
2420: switch (product->type) {
2421: case MATPRODUCT_AB:
2422: case MATPRODUCT_AtB:
2423: case MATPRODUCT_ABt:
2424: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2425: break;
2426: case MATPRODUCT_PtAP:
2427: case MATPRODUCT_RARt:
2428: case MATPRODUCT_ABC:
2429: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2430: break;
2431: default:
2432: break;
2433: }
2434: } else { /* fallback for AIJ */
2435: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2436: }
2437: PetscFunctionReturn(PETSC_SUCCESS);
2438: }
2440: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2441: {
2442: PetscFunctionBegin;
2443: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2444: PetscFunctionReturn(PETSC_SUCCESS);
2445: }
2447: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2448: {
2449: PetscFunctionBegin;
2450: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2451: PetscFunctionReturn(PETSC_SUCCESS);
2452: }
2454: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2455: {
2456: PetscFunctionBegin;
2457: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2458: PetscFunctionReturn(PETSC_SUCCESS);
2459: }
2461: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2462: {
2463: PetscFunctionBegin;
2464: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2465: PetscFunctionReturn(PETSC_SUCCESS);
2466: }
2468: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2469: {
2470: PetscFunctionBegin;
2471: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2472: PetscFunctionReturn(PETSC_SUCCESS);
2473: }
2475: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2476: {
2477: int i = blockIdx.x * blockDim.x + threadIdx.x;
2478: if (i < n) y[idx[i]] += x[i];
2479: }
2481: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2482: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2483: {
2484: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2485: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2486: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2487: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
2488: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2489: PetscBool compressed;
2490: PetscInt nx, ny;
2492: PetscFunctionBegin;
2493: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2494: if (!a->nz) {
2495: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
2496: else PetscCall(VecSeq_CUDA::Set(zz, 0));
2497: PetscFunctionReturn(PETSC_SUCCESS);
2498: }
2499: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2500: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2501: if (!trans) {
2502: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2503: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2504: } else {
2505: if (herm || !A->form_explicit_transpose) {
2506: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2507: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2508: } else {
2509: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2510: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
2511: }
2512: }
2513: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2514: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2516: try {
2517: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
2518: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2519: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
2521: PetscCall(PetscLogGpuTimeBegin());
2522: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2523: /* z = A x + beta y.
2524: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2525: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2526: */
2527: xptr = xarray;
2528: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2529: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2530: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2531: allocated to accommodate different uses. So we get the length info directly from mat.
2532: */
2533: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2534: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2535: nx = mat->num_cols; // since y = Ax
2536: ny = mat->num_rows;
2537: }
2538: } else {
2539: /* z = A^T x + beta y
2540: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2541: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2542: */
2543: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2544: dptr = zarray;
2545: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2546: if (compressed) { /* Scatter x to work vector */
2547: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2549: thrust::for_each(
2550: #if PetscDefined(HAVE_THRUST_ASYNC)
2551: thrust::cuda::par.on(PetscDefaultCudaStream),
2552: #endif
2553: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2554: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
2555: }
2556: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2557: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2558: nx = mat->num_rows; // since y = A^T x
2559: ny = mat->num_cols;
2560: }
2561: }
2563: /* csr_spmv does y = alpha op(A) x + beta y */
2564: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2565: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2566: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
2567: #else
2568: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
2569: #endif
2571: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2572: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2573: if (!matDescr) {
2574: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2575: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2576: }
2577: #endif
2579: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2580: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
2581: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
2582: PetscCallCUSPARSE(
2583: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
2584: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
2585: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
2586: PetscCallCUSPARSE(
2587: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2588: #endif
2589: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2590: } else {
2591: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2592: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
2593: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
2594: }
2596: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2598: } else {
2599: if (cusparsestruct->nrows) {
2600: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2601: }
2602: }
2603: PetscCall(PetscLogGpuTimeEnd());
2605: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2606: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2607: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2608: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
2609: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2610: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2611: }
2612: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2613: PetscCall(VecSeq_CUDA::Set(zz, 0));
2614: }
2616: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2617: if (compressed) {
2618: PetscCall(PetscLogGpuTimeBegin());
2619: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
2620: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
2621: PetscCall(PetscLogGpuTimeEnd());
2622: }
2623: } else {
2624: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2625: }
2626: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
2627: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
2628: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
2629: } catch (char *ex) {
2630: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2631: }
2632: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2633: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2634: PetscFunctionReturn(PETSC_SUCCESS);
2635: }
2637: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2638: {
2639: PetscFunctionBegin;
2640: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2641: PetscFunctionReturn(PETSC_SUCCESS);
2642: }
2644: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
2645: {
2646: PetscFunctionBegin;
2647: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetDiagonal(A, diag));
2648: PetscFunctionReturn(PETSC_SUCCESS);
2649: }
2651: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec ll, Vec rr)
2652: {
2653: PetscFunctionBegin;
2654: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2655: PetscFunctionReturn(PETSC_SUCCESS);
2656: }
2658: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
2659: {
2660: PetscFunctionBegin;
2661: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode));
2662: PetscFunctionReturn(PETSC_SUCCESS);
2663: }
2665: /*@
2666: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
2668: Collective
2670: Input Parameters:
2671: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2672: . m - number of rows
2673: . n - number of columns
2674: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
2675: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
2677: Output Parameter:
2678: . A - the matrix
2680: Level: intermediate
2682: Notes:
2683: This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
2684: calculations. For good matrix assembly performance the user should preallocate the matrix
2685: storage by setting the parameter `nz` (or the array `nnz`).
2687: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2688: MatXXXXSetPreallocation() paradgm instead of this routine directly.
2689: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
2691: The AIJ format, also called
2692: compressed row storage, is fully compatible with standard Fortran
2693: storage. That is, the stored row and column indices can begin at
2694: either one (as in Fortran) or zero.
2696: Specify the preallocated storage with either nz or nnz (not both).
2697: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2698: allocation.
2700: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
2702: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
2703: `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
2704: @*/
2705: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2706: {
2707: return MatSeqAIJCUSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2708: }
2710: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2711: {
2712: return MatSeqAIJCUSPARSE_CUPM_t::Destroy(A);
2713: }
2715: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2716: {
2717: PetscFunctionBegin;
2718: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2719: PetscFunctionReturn(PETSC_SUCCESS);
2720: }
2722: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2723: {
2724: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2725: Mat_SeqAIJCUSPARSE *cy;
2726: Mat_SeqAIJCUSPARSE *cx;
2727: PetscScalar *ay;
2728: const PetscScalar *ax;
2729: CsrMatrix *csry, *csrx;
2731: PetscFunctionBegin;
2732: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
2733: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
2734: if (X->ops->axpy != Y->ops->axpy) {
2735: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2736: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2737: PetscFunctionReturn(PETSC_SUCCESS);
2738: }
2739: /* if we are here, it means both matrices are bound to GPU */
2740: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
2741: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
2742: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2743: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2744: csry = (CsrMatrix *)cy->mat->mat;
2745: csrx = (CsrMatrix *)cx->mat->mat;
2746: /* see if we can turn this into a cublas axpy */
2747: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2748: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2749: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2750: if (eq) str = SAME_NONZERO_PATTERN;
2751: }
2752: /* spgeam is buggy with one column */
2753: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
2755: if (str == SUBSET_NONZERO_PATTERN) {
2756: PetscScalar b = 1.0;
2757: size_t bufferSize;
2758: void *buffer;
2760: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
2761: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
2762: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
2763: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2764: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2765: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
2766: PetscCall(PetscLogGpuTimeBegin());
2767: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2768: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2769: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2770: PetscCall(PetscLogGpuTimeEnd());
2771: PetscCallCUDA(cudaFree(buffer));
2773: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
2774: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
2775: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
2776: } else if (str == SAME_NONZERO_PATTERN) {
2777: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2778: } else {
2779: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2780: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2781: }
2782: PetscFunctionReturn(PETSC_SUCCESS);
2783: }
2785: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
2786: {
2787: PetscFunctionBegin;
2788: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Scale(Y, a));
2789: PetscFunctionReturn(PETSC_SUCCESS);
2790: }
2792: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2793: {
2794: PetscFunctionBegin;
2795: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::ZeroEntries(A));
2796: PetscFunctionReturn(PETSC_SUCCESS);
2797: }
2799: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(Mat A, PetscMemType *m)
2800: {
2801: PetscFunctionBegin;
2802: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetCurrentMemType(A, m));
2803: PetscFunctionReturn(PETSC_SUCCESS);
2804: }
2806: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
2807: {
2808: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2810: PetscFunctionBegin;
2811: if (A->factortype != MAT_FACTOR_NONE) {
2812: A->boundtocpu = flg;
2813: PetscFunctionReturn(PETSC_SUCCESS);
2814: }
2815: if (flg) {
2816: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2818: A->ops->scale = MatScale_SeqAIJ;
2819: A->ops->getdiagonal = MatGetDiagonal_SeqAIJ;
2820: A->ops->diagonalscale = MatDiagonalScale_SeqAIJ;
2821: A->ops->axpy = MatAXPY_SeqAIJ;
2822: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
2823: A->ops->mult = MatMult_SeqAIJ;
2824: A->ops->multadd = MatMultAdd_SeqAIJ;
2825: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
2826: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
2827: A->ops->multhermitiantranspose = NULL;
2828: A->ops->multhermitiantransposeadd = NULL;
2829: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
2830: A->ops->getcurrentmemtype = NULL;
2831: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2832: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2833: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
2834: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
2835: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2836: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2837: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
2838: } else {
2839: A->ops->scale = MatScale_SeqAIJCUSPARSE;
2840: A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE;
2841: A->ops->diagonalscale = MatDiagonalScale_SeqAIJCUSPARSE;
2842: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
2843: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
2844: A->ops->mult = MatMult_SeqAIJCUSPARSE;
2845: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
2846: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
2847: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
2848: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2849: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2850: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
2851: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
2852: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
2853: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
2854: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
2855: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
2856: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
2857: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
2858: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
2860: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
2861: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2862: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2863: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
2864: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
2865: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2866: }
2867: A->boundtocpu = flg;
2868: if (flg && a->inode.size_csr) {
2869: a->inode.use = PETSC_TRUE;
2870: } else {
2871: a->inode.use = PETSC_FALSE;
2872: }
2873: PetscFunctionReturn(PETSC_SUCCESS);
2874: }
2876: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
2877: {
2878: Mat B;
2880: PetscFunctionBegin;
2881: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
2882: if (reuse == MAT_INITIAL_MATRIX) {
2883: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2884: } else if (reuse == MAT_REUSE_MATRIX) {
2885: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2886: }
2887: B = *newmat;
2889: PetscCall(PetscFree(B->defaultvectype));
2890: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
2892: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2893: if (B->factortype == MAT_FACTOR_NONE) {
2894: Mat_SeqAIJCUSPARSE *spptr;
2895: PetscCall(PetscNew(&spptr));
2896: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2897: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2898: spptr->format = MAT_CUSPARSE_CSR;
2899: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
2900: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2901: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
2902: B->spptr = spptr;
2903: } else {
2904: Mat_SeqAIJCUSPARSETriFactors *spptr;
2906: PetscCall(PetscNew(&spptr));
2907: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2908: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2909: B->spptr = spptr;
2910: }
2911: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2912: }
2913: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
2914: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
2915: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
2916: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2917: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
2918: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
2919: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
2921: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
2922: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
2923: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
2924: #if defined(PETSC_HAVE_HYPRE)
2925: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
2926: #endif
2927: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
2928: PetscFunctionReturn(PETSC_SUCCESS);
2929: }
2931: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2932: {
2933: PetscFunctionBegin;
2934: PetscCall(MatCreate_SeqAIJ(B));
2935: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
2936: PetscFunctionReturn(PETSC_SUCCESS);
2937: }
2939: /*MC
2940: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
2942: Options Database Keys:
2943: + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
2944: . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2945: Other options include ell (ellpack) or hyb (hybrid).
2946: . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2947: - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
2949: Level: beginner
2951: Notes:
2952: These matrices can be in either CSR, ELL, or HYB format.
2954: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
2956: Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
2957: if some integer values passed in do not fit in `int`.
2959: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2960: M*/
2962: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2963: {
2964: PetscFunctionBegin;
2965: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
2966: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
2967: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
2968: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
2969: PetscFunctionReturn(PETSC_SUCCESS);
2970: }
2972: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
2973: {
2974: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
2976: PetscFunctionBegin;
2977: if (cusp) {
2978: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2979: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2980: delete cusp->workVector;
2981: delete cusp->rowoffsets_gpu;
2982: delete cusp->csr2csc_i;
2983: delete cusp->coords;
2984: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
2985: PetscCall(PetscFree(mat->spptr));
2986: }
2987: PetscFunctionReturn(PETSC_SUCCESS);
2988: }
2990: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2991: {
2992: PetscFunctionBegin;
2993: if (*mat) {
2994: delete (*mat)->values;
2995: delete (*mat)->column_indices;
2996: delete (*mat)->row_offsets;
2997: delete *mat;
2998: *mat = 0;
2999: }
3000: PetscFunctionReturn(PETSC_SUCCESS);
3001: }
3003: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3004: {
3005: CsrMatrix *mat;
3007: PetscFunctionBegin;
3008: if (*matstruct) {
3009: if ((*matstruct)->mat) {
3010: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3011: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3012: } else {
3013: mat = (CsrMatrix *)(*matstruct)->mat;
3014: PetscCall(CsrMatrix_Destroy(&mat));
3015: }
3016: }
3017: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
3018: delete (*matstruct)->cprowIndices;
3019: PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
3020: PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
3021: PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3023: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3024: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3026: for (int i = 0; i < 3; i++) {
3027: if (mdata->cuSpMV[i].initialized) {
3028: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
3029: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
3030: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3031: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3032: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
3033: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
3034: #endif
3035: }
3036: }
3037: delete *matstruct;
3038: *matstruct = NULL;
3039: }
3040: PetscFunctionReturn(PETSC_SUCCESS);
3041: }
3043: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3044: {
3045: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3047: PetscFunctionBegin;
3048: if (fs) {
3049: delete fs->rpermIndices;
3050: delete fs->cpermIndices;
3051: fs->rpermIndices = NULL;
3052: fs->cpermIndices = NULL;
3053: fs->init_dev_prop = PETSC_FALSE;
3054: PetscCallCUDA(cudaFree(fs->csrRowPtr));
3055: PetscCallCUDA(cudaFree(fs->csrColIdx));
3056: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
3057: PetscCallCUDA(cudaFree(fs->csrColIdx32));
3058: PetscCallCUDA(cudaFree(fs->csrVal));
3059: PetscCallCUDA(cudaFree(fs->diag));
3060: PetscCallCUDA(cudaFree(fs->X));
3061: PetscCallCUDA(cudaFree(fs->Y));
3062: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3063: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3064: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3065: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3066: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3067: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3068: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3069: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3070: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3071: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3072: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3073: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3074: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3075: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3076: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3077: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3078: PetscCall(PetscFree(fs->csrRowPtr_h));
3079: PetscCall(PetscFree(fs->csrVal_h));
3080: PetscCall(PetscFree(fs->diag_h));
3081: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3082: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3083: }
3084: PetscFunctionReturn(PETSC_SUCCESS);
3085: }
3087: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3088: {
3089: PetscFunctionBegin;
3090: if (*trifactors) {
3091: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3092: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3093: PetscCall(PetscFree(*trifactors));
3094: }
3095: PetscFunctionReturn(PETSC_SUCCESS);
3096: }
3098: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3099: {
3100: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3102: PetscFunctionBegin;
3103: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3104: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3105: if (destroy) {
3106: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3107: delete cusp->csr2csc_i;
3108: cusp->csr2csc_i = NULL;
3109: }
3110: A->transupdated = PETSC_FALSE;
3111: PetscFunctionReturn(PETSC_SUCCESS);
3112: }
3114: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3115: {
3116: PetscFunctionBegin;
3117: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3118: PetscFunctionReturn(PETSC_SUCCESS);
3119: }
3121: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3122: {
3123: PetscFunctionBegin;
3124: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3125: PetscFunctionReturn(PETSC_SUCCESS);
3126: }
3128: /*@C
3129: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
3131: Not Collective
3133: Input Parameters:
3134: + A - the matrix
3135: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3137: Output Parameters:
3138: + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
3139: - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
3141: Level: developer
3143: Note:
3144: When compressed is true, the CSR structure does not contain empty rows
3146: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
3147: @*/
3148: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3149: {
3150: PetscFunctionBegin;
3151: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3152: PetscFunctionReturn(PETSC_SUCCESS);
3153: }
3155: /*@C
3156: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
3158: Not Collective
3160: Input Parameters:
3161: + A - the matrix
3162: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3163: . i - the CSR row pointers
3164: - j - the CSR column indices
3166: Level: developer
3168: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
3169: @*/
3170: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
3171: {
3172: PetscFunctionBegin;
3173: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3174: PetscFunctionReturn(PETSC_SUCCESS);
3175: }
3177: /*@C
3178: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
3180: Not Collective
3182: Input Parameter:
3183: . A - a `MATSEQAIJCUSPARSE` matrix
3185: Output Parameter:
3186: . a - pointer to the device data
3188: Level: developer
3190: Note:
3191: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
3193: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
3194: @*/
3195: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3196: {
3197: return MatSeqAIJCUSPARSE_CUPM_t::GetArrayRead(A, a);
3198: }
3200: /*@C
3201: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
3203: Not Collective
3205: Input Parameters:
3206: + A - a `MATSEQAIJCUSPARSE` matrix
3207: - a - pointer to the device data
3209: Level: developer
3211: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
3212: @*/
3213: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3214: {
3215: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayRead(A, a);
3216: }
3218: /*@C
3219: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
3221: Not Collective
3223: Input Parameter:
3224: . A - a `MATSEQAIJCUSPARSE` matrix
3226: Output Parameter:
3227: . a - pointer to the device data
3229: Level: developer
3231: Note:
3232: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
3234: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
3235: @*/
3236: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
3237: {
3238: return MatSeqAIJCUSPARSE_CUPM_t::GetArray(A, a);
3239: }
3240: /*@C
3241: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
3243: Not Collective
3245: Input Parameters:
3246: + A - a `MATSEQAIJCUSPARSE` matrix
3247: - a - pointer to the device data
3249: Level: developer
3251: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
3252: @*/
3253: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
3254: {
3255: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArray(A, a);
3256: }
3258: /*@C
3259: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
3261: Not Collective
3263: Input Parameter:
3264: . A - a `MATSEQAIJCUSPARSE` matrix
3266: Output Parameter:
3267: . a - pointer to the device data
3269: Level: developer
3271: Note:
3272: Does not trigger any host to device copies.
3274: It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
3276: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
3277: @*/
3278: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
3279: {
3280: return MatSeqAIJCUSPARSE_CUPM_t::GetArrayWrite(A, a);
3281: }
3283: /*@C
3284: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
3286: Not Collective
3288: Input Parameters:
3289: + A - a `MATSEQAIJCUSPARSE` matrix
3290: - a - pointer to the device data
3292: Level: developer
3294: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
3295: @*/
3296: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
3297: {
3298: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3299: }
3301: struct IJCompare4 {
3302: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3303: {
3304: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
3305: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
3306: return false;
3307: }
3308: };
3310: struct Shift {
3311: int _shift;
3313: Shift(int shift) : _shift(shift) { }
3314: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
3315: };
3317: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3318: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3319: {
3320: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3321: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
3322: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3323: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3324: PetscInt Annz, Bnnz;
3325: cusparseStatus_t stat;
3326: PetscInt i, m, n, zero = 0;
3328: PetscFunctionBegin;
3331: PetscAssertPointer(C, 4);
3332: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3333: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
3334: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3335: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3336: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3337: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3338: if (reuse == MAT_INITIAL_MATRIX) {
3339: m = A->rmap->n;
3340: n = A->cmap->n + B->cmap->n;
3341: PetscCall(MatCreate(PETSC_COMM_SELF, C));
3342: PetscCall(MatSetSizes(*C, m, n, m, n));
3343: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
3344: c = (Mat_SeqAIJ *)(*C)->data;
3345: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3346: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3347: Ccsr = new CsrMatrix;
3348: Cmat->cprowIndices = NULL;
3349: c->compressedrow.use = PETSC_FALSE;
3350: c->compressedrow.nrows = 0;
3351: c->compressedrow.i = NULL;
3352: c->compressedrow.rindex = NULL;
3353: Ccusp->workVector = NULL;
3354: Ccusp->nrows = m;
3355: Ccusp->mat = Cmat;
3356: Ccusp->mat->mat = Ccsr;
3357: Ccsr->num_rows = m;
3358: Ccsr->num_cols = n;
3359: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3360: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3361: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3362: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3363: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3364: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3365: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3366: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3367: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3368: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3369: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3370: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3371: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3373: Acsr = (CsrMatrix *)Acusp->mat->mat;
3374: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3375: Annz = (PetscInt)Acsr->column_indices->size();
3376: Bnnz = (PetscInt)Bcsr->column_indices->size();
3377: c->nz = Annz + Bnnz;
3378: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
3379: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3380: Ccsr->values = new THRUSTARRAY(c->nz);
3381: Ccsr->num_entries = c->nz;
3382: Ccusp->coords = new THRUSTINTARRAY(c->nz);
3383: if (c->nz) {
3384: auto Acoo = new THRUSTINTARRAY32(Annz);
3385: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3386: auto Ccoo = new THRUSTINTARRAY32(c->nz);
3387: THRUSTINTARRAY32 *Aroff, *Broff;
3389: if (a->compressedrow.use) { /* need full row offset */
3390: if (!Acusp->rowoffsets_gpu) {
3391: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3392: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3393: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3394: }
3395: Aroff = Acusp->rowoffsets_gpu;
3396: } else Aroff = Acsr->row_offsets;
3397: if (b->compressedrow.use) { /* need full row offset */
3398: if (!Bcusp->rowoffsets_gpu) {
3399: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3400: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3401: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3402: }
3403: Broff = Bcusp->rowoffsets_gpu;
3404: } else Broff = Bcsr->row_offsets;
3405: PetscCall(PetscLogGpuTimeBegin());
3406: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3407: PetscCallCUSPARSE(stat);
3408: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3409: PetscCallCUSPARSE(stat);
3410: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3411: auto Aperm = thrust::make_constant_iterator(1);
3412: auto Bperm = thrust::make_constant_iterator(0);
3413: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3414: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3415: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
3416: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3417: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3418: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
3419: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3420: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3421: auto p1 = Ccusp->coords->begin();
3422: auto p2 = Ccusp->coords->begin();
3423: #if CCCL_VERSION >= 3001000
3424: cuda::std::advance(p2, Annz);
3425: #else
3426: thrust::advance(p2, Annz);
3427: #endif
3428: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
3429: auto cci = thrust::make_counting_iterator(zero);
3430: auto cce = thrust::make_counting_iterator(c->nz);
3431: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
3432: auto pred = thrust::identity<int>();
3433: #else
3434: auto pred = cuda::std::identity();
3435: #endif
3436: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3437: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3438: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
3439: PetscCallCUSPARSE(stat);
3440: PetscCall(PetscLogGpuTimeEnd());
3441: delete wPerm;
3442: delete Acoo;
3443: delete Bcoo;
3444: delete Ccoo;
3445: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3446: PetscCallCUSPARSE(stat);
3447: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3448: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3449: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3450: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3451: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
3452: CsrMatrix *CcsrT = new CsrMatrix;
3453: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3454: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3456: (*C)->form_explicit_transpose = PETSC_TRUE;
3457: (*C)->transupdated = PETSC_TRUE;
3458: Ccusp->rowoffsets_gpu = NULL;
3459: CmatT->cprowIndices = NULL;
3460: CmatT->mat = CcsrT;
3461: CcsrT->num_rows = n;
3462: CcsrT->num_cols = m;
3463: CcsrT->num_entries = c->nz;
3465: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
3466: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
3467: CcsrT->values = new THRUSTARRAY(c->nz);
3469: PetscCall(PetscLogGpuTimeBegin());
3470: auto rT = CcsrT->row_offsets->begin();
3471: if (AT) {
3472: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3473: #if CCCL_VERSION >= 3001000
3474: cuda::std::advance(rT, -1);
3475: #else
3476: thrust::advance(rT, -1);
3477: #endif
3478: }
3479: if (BT) {
3480: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3481: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3482: thrust::copy(titb, tite, rT);
3483: }
3484: auto cT = CcsrT->column_indices->begin();
3485: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3486: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3487: auto vT = CcsrT->values->begin();
3488: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3489: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3490: PetscCall(PetscLogGpuTimeEnd());
3492: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
3493: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
3494: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3495: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3496: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3497: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3498: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3499: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3500: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3501: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3502: PetscCallCUSPARSE(stat);
3503: Ccusp->matTranspose = CmatT;
3504: }
3505: }
3507: c->free_a = PETSC_TRUE;
3508: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3509: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3510: c->free_ij = PETSC_TRUE;
3511: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3512: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3513: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3514: ii = *Ccsr->row_offsets;
3515: jj = *Ccsr->column_indices;
3516: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3517: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3518: } else {
3519: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3520: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3521: }
3522: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3523: PetscCall(PetscMalloc1(m, &c->ilen));
3524: PetscCall(PetscMalloc1(m, &c->imax));
3525: c->maxnz = c->nz;
3526: c->nonzerorowcnt = 0;
3527: c->rmax = 0;
3528: for (i = 0; i < m; i++) {
3529: const PetscInt nn = c->i[i + 1] - c->i[i];
3530: c->ilen[i] = c->imax[i] = nn;
3531: c->nonzerorowcnt += (PetscInt)!!nn;
3532: c->rmax = PetscMax(c->rmax, nn);
3533: }
3534: PetscCall(PetscMalloc1(c->nz, &c->a));
3535: (*C)->nonzerostate++;
3536: PetscCall(PetscLayoutSetUp((*C)->rmap));
3537: PetscCall(PetscLayoutSetUp((*C)->cmap));
3538: Ccusp->nonzerostate = (*C)->nonzerostate;
3539: (*C)->preallocated = PETSC_TRUE;
3540: } else {
3541: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3542: c = (Mat_SeqAIJ *)(*C)->data;
3543: if (c->nz) {
3544: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3545: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3546: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3547: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3548: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3549: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3550: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3551: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3552: Acsr = (CsrMatrix *)Acusp->mat->mat;
3553: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3554: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3555: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3556: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3557: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3558: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3559: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3560: auto pmid = Ccusp->coords->begin();
3561: #if CCCL_VERSION >= 3001000
3562: cuda::std::advance(pmid, Acsr->num_entries);
3563: #else
3564: thrust::advance(pmid, Acsr->num_entries);
3565: #endif
3566: PetscCall(PetscLogGpuTimeBegin());
3567: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3568: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3569: thrust::for_each(zibait, zieait, VecCUDAEquals());
3570: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3571: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3572: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
3573: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3574: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3575: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
3576: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3577: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3578: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3579: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3580: auto vT = CcsrT->values->begin();
3581: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3582: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3583: (*C)->transupdated = PETSC_TRUE;
3584: }
3585: PetscCall(PetscLogGpuTimeEnd());
3586: }
3587: }
3588: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3589: (*C)->assembled = PETSC_TRUE;
3590: (*C)->was_assembled = PETSC_FALSE;
3591: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
3592: PetscFunctionReturn(PETSC_SUCCESS);
3593: }
3595: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3596: {
3597: PetscFunctionBegin;
3598: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3599: PetscFunctionReturn(PETSC_SUCCESS);
3600: }