Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <../src/mat/impls/aij/seq/cupm/aijcupm.hpp>
15: #include <thrust/adjacent_difference.h>
16: #if PETSC_CPP_VERSION >= 14
17: #define PETSC_HAVE_THRUST_ASYNC 1
18: // thrust::for_each(thrust::cuda::par.on()) requires C++14
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/remove.h>
22: #include <thrust/sort.h>
23: #include <thrust/tuple.h>
24: #include <thrust/unique.h>
25: #include <thrust/gather.h>
26: #include <thrust/binary_search.h> // for thrust::lower_bound
27: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0)
28: #include <cuda/std/functional>
29: #endif
31: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
32: /*
33: The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
34: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
35: */
36: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
37: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
38: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
40: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
41: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
42: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
43: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
44: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
45: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
46: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
47: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
48: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
49: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
50: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
51: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
52: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
53: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
54: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
56: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
57: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
58: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
59: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
61: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
62: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
64: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
65: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
66: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
67: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
69: // cusparseCreateCsr() separates types for row offsets and column indices in prototype, but requires them to have the same type at runtime!
70: const cusparseIndexType_t csrRowOffsetsType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
71: const cusparseIndexType_t csrColIndType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
73: using Csr2coo = Petsc::mat::aij::cupm::impl::Csr2coo;
74: using PetscIntToCInt = Petsc::mat::aij::cupm::impl::PetscIntToCInt;
76: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
77: {
78: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
80: PetscFunctionBegin;
81: switch (op) {
82: case MAT_CUSPARSE_MULT:
83: cusparsestruct->format = format;
84: break;
85: case MAT_CUSPARSE_ALL:
86: cusparsestruct->format = format;
87: break;
88: default:
89: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
90: }
91: PetscFunctionReturn(PETSC_SUCCESS);
92: }
94: /*@
95: MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
96: operation. Only the `MatMult()` operation can use different GPU storage formats
98: Not Collective
100: Input Parameters:
101: + A - Matrix of type `MATSEQAIJCUSPARSE`
102: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
103: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
104: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
106: Level: intermediate
108: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
109: @*/
110: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
111: {
112: PetscFunctionBegin;
114: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
115: PetscFunctionReturn(PETSC_SUCCESS);
116: }
118: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
119: {
120: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
122: PetscFunctionBegin;
123: cusparsestruct->use_cpu_solve = use_cpu;
124: PetscFunctionReturn(PETSC_SUCCESS);
125: }
127: /*@
128: MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
130: Input Parameters:
131: + A - Matrix of type `MATSEQAIJCUSPARSE`
132: - use_cpu - set flag for using the built-in CPU `MatSolve()`
134: Level: intermediate
136: Note:
137: The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
138: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
139: This method to specify if the solve is done on the CPU or GPU (GPU is the default).
141: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
142: @*/
143: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
144: {
145: PetscFunctionBegin;
147: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
148: PetscFunctionReturn(PETSC_SUCCESS);
149: }
151: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
152: {
153: PetscFunctionBegin;
154: switch (op) {
155: case MAT_FORM_EXPLICIT_TRANSPOSE:
156: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
157: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
158: A->form_explicit_transpose = flg;
159: break;
160: default:
161: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
162: break;
163: }
164: PetscFunctionReturn(PETSC_SUCCESS);
165: }
167: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
168: {
169: MatCUSPARSEStorageFormat format;
170: PetscBool flg;
171: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
173: PetscFunctionBegin;
174: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
175: if (A->factortype == MAT_FACTOR_NONE) {
176: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
177: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
179: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
180: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
181: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
182: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
183: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
184: /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
185: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
187: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
188: PetscCall(
189: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
190: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
191: }
192: PetscOptionsHeadEnd();
193: PetscFunctionReturn(PETSC_SUCCESS);
194: }
196: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
197: {
198: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
199: PetscInt m = A->rmap->n;
200: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
201: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
202: const MatScalar *Aa = a->a;
203: PetscInt *Mi, *Mj, Mnz;
204: PetscScalar *Ma;
206: PetscFunctionBegin;
207: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
208: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
209: if (!fs->csrRowPtr) { // Is this the first time we are doing setup? Use csrRowPtr since it is not null even when m=0
210: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
211: Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
212: PetscCall(PetscMalloc1(m + 1, &Mi));
213: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
214: PetscCall(PetscMalloc1(Mnz, &Ma));
215: Mi[0] = 0;
216: for (PetscInt i = 0; i < m; i++) {
217: PetscInt llen = Ai[i + 1] - Ai[i];
218: PetscInt ulen = adiag[i] - adiag[i + 1];
219: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
220: Mj[Mi[i] + llen] = i; // diagonal entry
221: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
222: Mi[i + 1] = Mi[i] + llen + ulen;
223: }
224: // Copy M (L,U) from host to device
225: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
226: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
227: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
228: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
229: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
231: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
232: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
233: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
234: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
235: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
236: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
237: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
239: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
240: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
241: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
243: fillMode = CUSPARSE_FILL_MODE_UPPER;
244: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
245: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
246: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
247: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
249: // Allocate work vectors in SpSv
250: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
251: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
253: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
254: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
256: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
257: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
258: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
259: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
260: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
261: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
262: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
264: // Record for reuse
265: fs->csrRowPtr_h = Mi;
266: fs->csrVal_h = Ma;
267: PetscCall(PetscFree(Mj));
268: }
269: // Copy the value
270: Mi = fs->csrRowPtr_h;
271: Ma = fs->csrVal_h;
272: Mnz = Mi[m];
273: for (PetscInt i = 0; i < m; i++) {
274: PetscInt llen = Ai[i + 1] - Ai[i];
275: PetscInt ulen = adiag[i] - adiag[i + 1];
276: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
277: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]]; // recover the diagonal entry
278: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
279: }
280: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
282: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
283: if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
284: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
285: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
286: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
287: } else
288: #endif
289: {
290: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
291: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
293: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
294: fs->updatedSpSVAnalysis = PETSC_TRUE;
295: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
296: }
297: }
298: PetscFunctionReturn(PETSC_SUCCESS);
299: }
301: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
302: {
303: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
304: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
305: IS isrow = a->row, isicol = a->icol;
306: PetscBool row_identity, col_identity;
307: PetscInt n = A->rmap->n;
309: PetscFunctionBegin;
310: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
311: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
313: cusparseTriFactors->nnz = a->nz;
315: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
316: /* lower triangular indices */
317: PetscCall(ISIdentity(isrow, &row_identity));
318: if (!row_identity && !cusparseTriFactors->rpermIndices) {
319: const PetscInt *r;
321: PetscCall(ISGetIndices(isrow, &r));
322: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
323: cusparseTriFactors->rpermIndices->assign(r, r + n);
324: PetscCall(ISRestoreIndices(isrow, &r));
325: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
326: }
328: /* upper triangular indices */
329: PetscCall(ISIdentity(isicol, &col_identity));
330: if (!col_identity && !cusparseTriFactors->cpermIndices) {
331: const PetscInt *c;
333: PetscCall(ISGetIndices(isicol, &c));
334: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
335: cusparseTriFactors->cpermIndices->assign(c, c + n);
336: PetscCall(ISRestoreIndices(isicol, &c));
337: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
338: }
339: PetscFunctionReturn(PETSC_SUCCESS);
340: }
342: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
343: {
344: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
345: PetscInt m = A->rmap->n;
346: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
347: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
348: const MatScalar *Aa = a->a;
349: PetscInt *Mj, Mnz;
350: PetscScalar *Ma, *D;
352: PetscFunctionBegin;
353: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
354: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
355: if (!fs->csrRowPtr) { // Is this the first time we are doing setup? Use csrRowPtr since it is not null even m=0
356: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
357: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
358: Mnz = Ai[m]; // Unz (with the unit diagonal)
359: PetscCall(PetscMalloc1(Mnz, &Ma));
360: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
361: PetscCall(PetscMalloc1(m, &D)); // the diagonal
362: for (PetscInt i = 0; i < m; i++) {
363: PetscInt ulen = Ai[i + 1] - Ai[i];
364: Mj[Ai[i]] = i; // diagonal entry
365: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
366: }
367: // Copy M (U) from host to device
368: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
369: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
370: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
371: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
372: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
373: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
375: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
376: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
377: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
378: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
379: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
380: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
381: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
383: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
384: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
385: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
387: // Allocate work vectors in SpSv
388: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
389: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
391: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
392: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
394: // Query buffer sizes for SpSV and then allocate buffers
395: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
396: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
397: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
399: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
400: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
401: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
403: // Record for reuse
404: fs->csrVal_h = Ma;
405: fs->diag_h = D;
406: PetscCall(PetscFree(Mj));
407: }
408: // Copy the value
409: Ma = fs->csrVal_h;
410: D = fs->diag_h;
411: Mnz = Ai[m];
412: for (PetscInt i = 0; i < m; i++) {
413: D[i] = Aa[adiag[i]]; // actually Aa[adiag[i]] is the inverse of the diagonal
414: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
415: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
416: }
417: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
418: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
420: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
421: if (fs->updatedSpSVAnalysis) {
422: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
423: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
424: } else
425: #endif
426: {
427: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
428: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
429: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
430: fs->updatedSpSVAnalysis = PETSC_TRUE;
431: }
432: }
433: PetscFunctionReturn(PETSC_SUCCESS);
434: }
436: // Solve Ut D U x = b
437: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
438: {
439: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
440: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
441: const PetscScalar *barray;
442: PetscScalar *xarray;
443: thrust::device_ptr<const PetscScalar> bGPU;
444: thrust::device_ptr<PetscScalar> xGPU;
445: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
446: PetscInt m = A->rmap->n;
448: PetscFunctionBegin;
449: PetscCall(PetscLogGpuTimeBegin());
450: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
451: PetscCall(VecCUDAGetArrayRead(b, &barray));
452: xGPU = thrust::device_pointer_cast(xarray);
453: bGPU = thrust::device_pointer_cast(barray);
455: // Reorder b with the row permutation if needed, and wrap the result in fs->X
456: if (fs->rpermIndices) {
457: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
458: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
459: } else {
460: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
461: }
463: // Solve Ut Y = X
464: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
465: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
467: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
468: // It is basically a vector element-wise multiplication, but cublas does not have it!
469: #if CCCL_VERSION >= 3001000
470: auto multiplies = cuda::std::multiplies<PetscScalar>();
471: #else
472: auto multiplies = thrust::multiplies<PetscScalar>();
473: #endif
474: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), multiplies));
476: // Solve U X = Y
477: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
478: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
479: } else {
480: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
481: }
482: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
484: // Reorder X with the column permutation if needed, and put the result back to x
485: if (fs->cpermIndices) {
486: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
487: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
488: }
490: PetscCall(VecCUDARestoreArrayRead(b, &barray));
491: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
492: PetscCall(PetscLogGpuTimeEnd());
493: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
494: PetscFunctionReturn(PETSC_SUCCESS);
495: }
497: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
498: {
499: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
500: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
501: IS ip = a->row;
502: PetscBool perm_identity;
503: PetscInt n = A->rmap->n;
505: PetscFunctionBegin;
506: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
508: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
509: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
511: A->offloadmask = PETSC_OFFLOAD_BOTH;
513: /* lower triangular indices */
514: PetscCall(ISIdentity(ip, &perm_identity));
515: if (!perm_identity) {
516: IS iip;
517: const PetscInt *irip, *rip;
519: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
520: PetscCall(ISGetIndices(iip, &irip));
521: PetscCall(ISGetIndices(ip, &rip));
522: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
523: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
524: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
525: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
526: PetscCall(ISRestoreIndices(iip, &irip));
527: PetscCall(ISDestroy(&iip));
528: PetscCall(ISRestoreIndices(ip, &rip));
529: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
530: }
531: PetscFunctionReturn(PETSC_SUCCESS);
532: }
534: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
535: {
536: PetscFunctionBegin;
537: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
538: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
539: B->offloadmask = PETSC_OFFLOAD_CPU;
540: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
541: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; // since symmetric
542: B->ops->matsolve = NULL;
543: B->ops->matsolvetranspose = NULL;
544: /* get the triangular factors */
545: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
546: PetscFunctionReturn(PETSC_SUCCESS);
547: }
549: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
550: {
551: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
552: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
553: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
554: cusparseIndexBase_t indexBase;
556: PetscFunctionBegin;
557: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
558: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
559: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
560: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
561: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
562: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
563: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
564: PetscCall(PetscLogGpuTimeBegin());
565: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
566: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
567: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
568: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
569: indexBase = cusparseGetMatIndexBase(matstruct->descr);
570: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
571: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
573: /* set alpha and beta */
574: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
575: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
576: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
577: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
578: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
579: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
581: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
582: CsrMatrix *matrixT = new CsrMatrix;
583: matstructT->mat = matrixT;
584: matrixT->num_rows = A->cmap->n;
585: matrixT->num_cols = A->rmap->n;
586: matrixT->num_entries = a->nz;
587: matrixT->row_offsets = new THRUSTINTARRAY(matrixT->num_rows + 1);
588: matrixT->column_indices = new THRUSTINTARRAY(a->nz);
589: matrixT->values = new THRUSTARRAY(a->nz);
591: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
592: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
593: PetscCallCUSPARSE(cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), csrRowOffsetsType, csrColIndType, indexBase, cusparse_scalartype));
594: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
595: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
596: }
597: }
598: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
599: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
600: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
601: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
602: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
603: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
604: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
605: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
606: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
607: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
608: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
609: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
610: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
611: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
612: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
613: }
614: if (!cusparsestruct->csr2csc_i) { // not using cusparseCsr2cscEx2() because it requires 32-bit indices
615: THRUSTINTARRAY row_indices(matrix->num_entries);
617: // Transpose the matrix via COO, i.e., by putting the row indices in column_indices[] and the column indices in row_indices[]
618: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); // will store the matrix to matrixT permutation, i.e., entry matrixT[i] is matrix[csr2csc_i[i]]
619: PetscCallThrust(thrust::sequence(thrust::device, cusparsestruct->csr2csc_i->begin(), cusparsestruct->csr2csc_i->end()));
620: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->rmap->n), Csr2coo(cusparsestruct->rowoffsets_gpu->data().get(), matrixT->column_indices->data().get())));
621: row_indices = *matrix->column_indices;
622: // Sort the COO by row then column, and get the permutation csr2csc_i[]
623: PetscCallThrust(thrust::sort_by_key(thrust::device, thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), matrixT->column_indices->begin())), thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(), matrixT->column_indices->end())),
624: cusparsestruct->csr2csc_i->begin()));
625: // Finalize matrixT's row_offsets by looking up row_indices[]
626: PetscCallThrust(thrust::lower_bound(thrust::device, row_indices.begin(), row_indices.end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(A->cmap->n + 1), matrixT->row_offsets->begin()));
627: }
628: PetscCallThrust(thrust::gather(thrust::device, cusparsestruct->csr2csc_i->begin(), cusparsestruct->csr2csc_i->end(), matrix->values->begin(), matrixT->values->begin()));
629: }
630: PetscCall(PetscLogGpuTimeEnd());
631: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
632: /* the compressed row indices is not used for matTranspose */
633: matstructT->cprowIndices = NULL;
634: /* assign the pointer */
635: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
636: A->transupdated = PETSC_TRUE;
637: PetscFunctionReturn(PETSC_SUCCESS);
638: }
640: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
641: {
642: const PetscScalar *barray;
643: PetscScalar *xarray;
644: thrust::device_ptr<const PetscScalar> bGPU;
645: thrust::device_ptr<PetscScalar> xGPU;
646: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
647: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
648: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
649: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
650: PetscInt m = A->rmap->n;
652: PetscFunctionBegin;
653: PetscCall(PetscLogGpuTimeBegin());
654: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
655: PetscCall(VecCUDAGetArrayRead(b, &barray));
656: xGPU = thrust::device_pointer_cast(xarray);
657: bGPU = thrust::device_pointer_cast(barray);
659: // Reorder b with the row permutation if needed, and wrap the result in fs->X
660: if (fs->rpermIndices) {
661: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
662: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
663: } else {
664: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
665: }
667: // Solve L Y = X
668: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
669: // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
670: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
672: // Solve U X = Y
673: if (fs->cpermIndices) {
674: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
675: } else {
676: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
677: }
678: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
680: // Reorder X with the column permutation if needed, and put the result back to x
681: if (fs->cpermIndices) {
682: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
683: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
684: }
685: PetscCall(VecCUDARestoreArrayRead(b, &barray));
686: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
687: PetscCall(PetscLogGpuTimeEnd());
688: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
689: PetscFunctionReturn(PETSC_SUCCESS);
690: }
692: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
693: {
694: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
695: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
696: const PetscScalar *barray;
697: PetscScalar *xarray;
698: thrust::device_ptr<const PetscScalar> bGPU;
699: thrust::device_ptr<PetscScalar> xGPU;
700: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
701: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
702: PetscInt m = A->rmap->n;
704: PetscFunctionBegin;
705: PetscCall(PetscLogGpuTimeBegin());
706: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
707: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
708: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
709: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
711: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
712: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
713: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
714: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
715: fs->createdTransposeSpSVDescr = PETSC_TRUE;
716: }
718: if (!fs->updatedTransposeSpSVAnalysis) {
719: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
721: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
722: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
723: }
725: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
726: PetscCall(VecCUDAGetArrayRead(b, &barray));
727: xGPU = thrust::device_pointer_cast(xarray);
728: bGPU = thrust::device_pointer_cast(barray);
730: // Reorder b with the row permutation if needed, and wrap the result in fs->X
731: if (fs->rpermIndices) {
732: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
733: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
734: } else {
735: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
736: }
738: // Solve Ut Y = X
739: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
740: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
742: // Solve Lt X = Y
743: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
744: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
745: } else {
746: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
747: }
748: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
750: // Reorder X with the column permutation if needed, and put the result back to x
751: if (fs->cpermIndices) {
752: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
753: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
754: }
756: PetscCall(VecCUDARestoreArrayRead(b, &barray));
757: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
758: PetscCall(PetscLogGpuTimeEnd());
759: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
760: PetscFunctionReturn(PETSC_SUCCESS);
761: }
763: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
764: {
765: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
766: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
767: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
768: CsrMatrix *Acsr;
769: PetscInt m, nz;
770: PetscBool flg;
772: PetscFunctionBegin;
773: if (PetscDefined(USE_DEBUG)) {
774: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
775: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
776: }
778: /* Copy A's value to fact */
779: m = fact->rmap->n;
780: nz = aij->nz;
781: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
782: Acsr = (CsrMatrix *)Acusp->mat->mat;
783: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
785: PetscCall(PetscLogGpuTimeBegin());
786: /* Factorize fact inplace */
787: if (m)
788: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
789: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
790: if (PetscDefined(USE_DEBUG)) {
791: int numerical_zero;
792: cusparseStatus_t status;
793: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
794: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
795: }
797: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
798: if (fs->updatedSpSVAnalysis) {
799: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
800: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
801: } else
802: #endif
803: {
804: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
805: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
806: */
807: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
809: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
811: fs->updatedSpSVAnalysis = PETSC_TRUE;
812: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
813: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
814: }
816: fact->offloadmask = PETSC_OFFLOAD_GPU;
817: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
818: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
819: fact->ops->matsolve = NULL;
820: fact->ops->matsolvetranspose = NULL;
821: PetscCall(PetscLogGpuTimeEnd());
822: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
823: PetscFunctionReturn(PETSC_SUCCESS);
824: }
826: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
827: {
828: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
829: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
830: PetscInt m, nz;
832: PetscFunctionBegin;
833: if (PetscDefined(USE_DEBUG)) {
834: PetscBool flg, diagDense;
836: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
837: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
838: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
839: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
840: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
841: }
843: /* Free the old stale stuff */
844: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
846: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
847: but they will not be used. Allocate them just for easy debugging.
848: */
849: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
851: fact->offloadmask = PETSC_OFFLOAD_BOTH;
852: fact->factortype = MAT_FACTOR_ILU;
853: fact->info.factor_mallocs = 0;
854: fact->info.fill_ratio_given = info->fill;
855: fact->info.fill_ratio_needed = 1.0;
857: aij->row = NULL;
858: aij->col = NULL;
860: /* ====================================================================== */
861: /* Copy A's i, j to fact and also allocate the value array of fact. */
862: /* We'll do in-place factorization on fact */
863: /* ====================================================================== */
864: const PetscInt *Ai, *Aj;
866: m = fact->rmap->n;
867: nz = aij->nz;
869: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
870: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
871: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
872: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed
874: PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
875: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
876: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));
878: /* ====================================================================== */
879: /* Create descriptors for M, L, U */
880: /* ====================================================================== */
881: cusparseFillMode_t fillMode;
882: cusparseDiagType_t diagType;
884: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
885: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
886: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
888: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
889: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
890: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
891: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
892: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
893: */
894: fillMode = CUSPARSE_FILL_MODE_LOWER;
895: diagType = CUSPARSE_DIAG_TYPE_UNIT;
896: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
897: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
898: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
900: fillMode = CUSPARSE_FILL_MODE_UPPER;
901: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
902: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
903: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
904: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
906: /* ========================================================================= */
907: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
908: /* ========================================================================= */
909: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
910: if (m)
911: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
912: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
914: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
915: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
917: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
918: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
920: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
921: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
923: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
924: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
926: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
927: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
928: spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
929: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
930: */
931: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
932: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
933: fs->spsvBuffer_L = fs->factBuffer_M;
934: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
935: } else {
936: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
937: fs->spsvBuffer_U = fs->factBuffer_M;
938: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
939: }
941: /* ========================================================================== */
942: /* Perform analysis of ilu0 on M, SpSv on L and U */
943: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
944: /* ========================================================================== */
945: int structural_zero;
946: cusparseStatus_t status;
948: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
949: if (m)
950: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
951: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
952: if (PetscDefined(USE_DEBUG)) {
953: /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
954: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
955: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
956: }
958: /* Estimate FLOPs of the numeric factorization */
959: {
960: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
961: PetscInt *Ai, nzRow, nzLeft;
962: const PetscInt *adiag;
963: PetscLogDouble flops = 0.0;
965: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
966: Ai = Aseq->i;
967: for (PetscInt i = 0; i < m; i++) {
968: if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
969: nzRow = Ai[i + 1] - Ai[i];
970: nzLeft = adiag[i] - Ai[i];
971: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
972: and include the eliminated one will be updated, which incurs a multiplication and an addition.
973: */
974: nzLeft = (nzRow - 1) / 2;
975: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
976: }
977: }
978: fs->numericFactFlops = flops;
979: }
980: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
981: PetscFunctionReturn(PETSC_SUCCESS);
982: }
984: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
985: {
986: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
987: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
988: const PetscScalar *barray;
989: PetscScalar *xarray;
991: PetscFunctionBegin;
992: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
993: PetscCall(VecCUDAGetArrayRead(b, &barray));
994: PetscCall(PetscLogGpuTimeBegin());
996: /* Solve L*y = b */
997: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
998: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
999: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1000: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1002: /* Solve Lt*x = y */
1003: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1004: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1005: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1007: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1008: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1010: PetscCall(PetscLogGpuTimeEnd());
1011: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1012: PetscFunctionReturn(PETSC_SUCCESS);
1013: }
1015: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1016: {
1017: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1018: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1019: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1020: CsrMatrix *Acsr;
1021: PetscInt m, nz;
1022: PetscBool flg;
1024: PetscFunctionBegin;
1025: if (PetscDefined(USE_DEBUG)) {
1026: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1027: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1028: }
1030: /* Copy A's value to fact */
1031: m = fact->rmap->n;
1032: nz = aij->nz;
1033: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1034: Acsr = (CsrMatrix *)Acusp->mat->mat;
1035: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1037: /* Factorize fact inplace */
1038: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1039: csric02() only takes the lower triangular part of matrix A to perform factorization.
1040: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1041: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1042: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1043: */
1044: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1045: if (PetscDefined(USE_DEBUG)) {
1046: int numerical_zero;
1047: cusparseStatus_t status;
1048: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1049: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1050: }
1052: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1053: if (fs->updatedSpSVAnalysis) {
1054: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1055: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1056: } else
1057: #endif
1058: {
1059: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1061: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1062: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1063: */
1064: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1065: fs->updatedSpSVAnalysis = PETSC_TRUE;
1066: }
1068: fact->offloadmask = PETSC_OFFLOAD_GPU;
1069: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1070: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1071: fact->ops->matsolve = NULL;
1072: fact->ops->matsolvetranspose = NULL;
1073: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1074: PetscFunctionReturn(PETSC_SUCCESS);
1075: }
1077: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1078: {
1079: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1080: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1081: PetscInt m, nz;
1083: PetscFunctionBegin;
1084: if (PetscDefined(USE_DEBUG)) {
1085: PetscBool flg, diagDense;
1087: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1088: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1089: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1090: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1091: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
1092: }
1094: /* Free the old stale stuff */
1095: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1097: /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1098: but they will not be used. Allocate them just for easy debugging.
1099: */
1100: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1102: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1103: fact->factortype = MAT_FACTOR_ICC;
1104: fact->info.factor_mallocs = 0;
1105: fact->info.fill_ratio_given = info->fill;
1106: fact->info.fill_ratio_needed = 1.0;
1108: aij->row = NULL;
1109: aij->col = NULL;
1111: /* ====================================================================== */
1112: /* Copy A's i, j to fact and also allocate the value array of fact. */
1113: /* We'll do in-place factorization on fact */
1114: /* ====================================================================== */
1115: const PetscInt *Ai, *Aj;
1117: m = fact->rmap->n;
1118: nz = aij->nz;
1120: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1121: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1122: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1123: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); // Ai is uncompressed
1125: PetscCheck(nz <= INT_MAX && m <= INT_MAX, PETSC_COMM_SELF, PETSC_ERR_SUP, "nnz %" PetscInt_FMT " and rows %" PetscInt_FMT " overflow C int", nz, m);
1126: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Ai, Ai + m + 1, fs->csrRowPtr32, PetscIntToCInt()));
1127: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), Aj, Aj + nz, fs->csrColIdx32, PetscIntToCInt()));
1129: /* ====================================================================== */
1130: /* Create mat descriptors for M, L */
1131: /* ====================================================================== */
1132: cusparseFillMode_t fillMode;
1133: cusparseDiagType_t diagType;
1135: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1136: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1137: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1139: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1140: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1141: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1142: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1143: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1144: */
1145: fillMode = CUSPARSE_FILL_MODE_LOWER;
1146: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1147: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1148: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1149: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1151: /* ========================================================================= */
1152: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1153: /* ========================================================================= */
1154: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
1155: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
1157: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1158: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1160: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1161: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1163: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1164: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1166: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1167: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1169: /* To save device memory, we make the factorization buffer share with one of the solver buffer.
1170: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1171: */
1172: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1173: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1174: fs->spsvBuffer_L = fs->factBuffer_M;
1175: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1176: } else {
1177: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
1178: fs->spsvBuffer_Lt = fs->factBuffer_M;
1179: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1180: }
1182: /* ========================================================================== */
1183: /* Perform analysis of ic0 on M */
1184: /* The lower triangular part of M has the same sparsity pattern as L */
1185: /* ========================================================================== */
1186: int structural_zero;
1187: cusparseStatus_t status;
1189: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1190: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1191: if (PetscDefined(USE_DEBUG)) {
1192: /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1193: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1194: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1195: }
1197: /* Estimate FLOPs of the numeric factorization */
1198: {
1199: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1200: PetscInt *Ai, nzRow, nzLeft;
1201: PetscLogDouble flops = 0.0;
1203: Ai = Aseq->i;
1204: for (PetscInt i = 0; i < m; i++) {
1205: nzRow = Ai[i + 1] - Ai[i];
1206: if (nzRow > 1) {
1207: /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1208: and include the eliminated one will be updated, which incurs a multiplication and an addition.
1209: */
1210: nzLeft = (nzRow - 1) / 2;
1211: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1212: }
1213: }
1214: fs->numericFactFlops = flops;
1215: }
1216: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1217: PetscFunctionReturn(PETSC_SUCCESS);
1218: }
1220: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1221: {
1222: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
1223: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1225: PetscFunctionBegin;
1226: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1227: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1228: B->offloadmask = PETSC_OFFLOAD_CPU;
1230: if (!cusparsestruct->use_cpu_solve) {
1231: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
1232: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1233: }
1234: B->ops->matsolve = NULL;
1235: B->ops->matsolvetranspose = NULL;
1237: /* get the triangular factors */
1238: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
1239: PetscFunctionReturn(PETSC_SUCCESS);
1240: }
1242: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1243: {
1244: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
1246: PetscFunctionBegin;
1247: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1248: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1249: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1250: PetscFunctionReturn(PETSC_SUCCESS);
1251: }
1253: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1254: {
1255: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1257: PetscFunctionBegin;
1258: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1259: if (!info->factoronhost) {
1260: PetscCall(ISIdentity(isrow, &row_identity));
1261: PetscCall(ISIdentity(iscol, &col_identity));
1262: }
1263: if (!info->levels && row_identity && col_identity) {
1264: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1265: } else {
1266: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1267: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1268: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1269: }
1270: PetscFunctionReturn(PETSC_SUCCESS);
1271: }
1273: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1274: {
1275: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1277: PetscFunctionBegin;
1278: PetscBool perm_identity = PETSC_FALSE;
1279: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
1280: if (!info->levels && perm_identity) {
1281: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1282: } else {
1283: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1284: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1285: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1286: }
1287: PetscFunctionReturn(PETSC_SUCCESS);
1288: }
1290: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1291: {
1292: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1294: PetscFunctionBegin;
1295: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1296: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1297: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1298: PetscFunctionReturn(PETSC_SUCCESS);
1299: }
1301: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1302: {
1303: PetscFunctionBegin;
1304: *type = MATSOLVERCUSPARSE;
1305: PetscFunctionReturn(PETSC_SUCCESS);
1306: }
1308: /*MC
1309: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
1310: on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1311: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1312: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
1313: CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1314: algorithms are not recommended. This class does NOT support direct solver operations.
1316: Level: beginner
1318: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
1319: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1320: M*/
1322: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1323: {
1324: PetscInt n = A->rmap->n;
1326: PetscFunctionBegin;
1327: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1328: PetscCall(MatSetSizes(*B, n, n, n, n));
1329: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
1330: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
1332: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1333: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1334: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1335: if (!A->boundtocpu) {
1336: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1337: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1338: } else {
1339: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1340: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1341: }
1342: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1343: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1344: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1345: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1346: if (!A->boundtocpu) {
1347: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1348: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1349: } else {
1350: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1351: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1352: }
1353: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1354: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1355: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
1357: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
1358: (*B)->canuseordering = PETSC_TRUE;
1359: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
1360: PetscFunctionReturn(PETSC_SUCCESS);
1361: }
1363: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1364: {
1365: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1366: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1367: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1369: PetscFunctionBegin;
1370: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1371: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1372: if (A->factortype == MAT_FACTOR_NONE) {
1373: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1374: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1375: } else if (fs->csrVal) {
1376: /* We have a factorized matrix on device and are able to copy it to host */
1377: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
1378: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
1379: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
1380: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
1381: A->offloadmask = PETSC_OFFLOAD_BOTH;
1382: }
1383: PetscFunctionReturn(PETSC_SUCCESS);
1384: }
1386: /* Policy struct for MatSeqAIJCUSPARSE_CUPM shared template (CUDA specialisation) */
1387: struct MatSeqAIJCUSPARSE_Policy {
1388: typedef Mat_SeqAIJCUSPARSE mat_struct_type;
1389: typedef Mat_SeqAIJCUSPARSEMultStruct mult_struct_type;
1391: static int storage_format_csr() { return (int)MAT_CUSPARSE_CSR; }
1392: static int storage_format_ell() { return (int)MAT_CUSPARSE_ELL; }
1393: static int storage_format_hyb() { return (int)MAT_CUSPARSE_HYB; }
1395: static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
1396: static PetscErrorCode CopyFromGPU(Mat A) { return MatSeqAIJCUSPARSECopyFromGPU(A); }
1397: static PetscErrorCode InvalidateTranspose(Mat A, PetscBool d) { return MatSeqAIJCUSPARSEInvalidateTranspose(A, d); }
1398: static PetscErrorCode ConvertFromSeqAIJ(Mat B, MatType t, MatReuse r, Mat *C) { return MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, t, r, C); }
1399: static const char *mat_type_name;
1401: static PetscErrorCode Destroy(Mat A) { return MatSeqAIJCUSPARSE_Destroy(A); }
1402: static PetscErrorCode TriFactorsDestroy(void **spptr) { return MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)spptr); }
1403: static const char *set_format_c;
1404: static const char *set_use_cpu_solve_c;
1405: static const char *product_seqdense_device_c;
1406: static const char *product_seqdense_c;
1407: static const char *product_self_c;
1408: static const char *seq_convert_hypre_c;
1410: static PetscErrorCode VecGetArrayRead(Vec v, const PetscScalar **a) { return VecCUDAGetArrayRead(v, a); }
1411: static PetscErrorCode VecRestoreArrayRead(Vec v, const PetscScalar **a) { return VecCUDARestoreArrayRead(v, a); }
1412: static PetscErrorCode VecGetArrayWrite(Vec v, PetscScalar **a) { return VecCUDAGetArrayWrite(v, a); }
1413: static PetscErrorCode VecRestoreArrayWrite(Vec v, PetscScalar **a) { return VecCUDARestoreArrayWrite(v, a); }
1414: };
1415: const char *MatSeqAIJCUSPARSE_Policy::mat_type_name = MATSEQAIJCUSPARSE;
1416: const char *MatSeqAIJCUSPARSE_Policy::set_format_c = "MatCUSPARSESetFormat_C";
1417: const char *MatSeqAIJCUSPARSE_Policy::set_use_cpu_solve_c = "MatCUSPARSESetUseCPUSolve_C";
1418: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_device_c = "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C";
1419: const char *MatSeqAIJCUSPARSE_Policy::product_seqdense_c = "MatProductSetFromOptions_seqaijcusparse_seqdense_C";
1420: const char *MatSeqAIJCUSPARSE_Policy::product_self_c = "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C";
1421: const char *MatSeqAIJCUSPARSE_Policy::seq_convert_hypre_c = "MatConvert_seqaijcusparse_hypre_C";
1423: using MatSeqAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatSeqAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatSeqAIJCUSPARSE_Policy>;
1425: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1426: {
1427: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArray(A, array);
1428: }
1430: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1431: {
1432: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArray(A, array);
1433: }
1435: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1436: {
1437: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayRead(A, array);
1438: }
1440: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
1441: {
1442: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayRead(A, array);
1443: }
1445: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1446: {
1447: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJGetArrayWrite(A, array);
1448: }
1450: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
1451: {
1452: return MatSeqAIJCUSPARSE_CUPM_t::SeqAIJRestoreArrayWrite(A, array);
1453: }
1455: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
1456: {
1457: Mat_SeqAIJCUSPARSE *cusp;
1458: CsrMatrix *matrix;
1460: PetscFunctionBegin;
1461: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1462: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
1463: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
1464: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
1465: matrix = (CsrMatrix *)cusp->mat->mat;
1467: if (i) *i = matrix->row_offsets->data().get();
1468: if (j) *j = matrix->column_indices->data().get();
1469: if (a) *a = matrix->values->data().get();
1470: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
1471: PetscFunctionReturn(PETSC_SUCCESS);
1472: }
1474: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1475: {
1476: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1477: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1478: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1479: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
1480: PetscBool both = PETSC_TRUE;
1482: PetscFunctionBegin;
1483: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
1484: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1485: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1486: CsrMatrix *matrix;
1487: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
1489: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
1490: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1491: matrix->values->assign(a->a, a->a + a->nz);
1492: PetscCallCUDA(WaitForCUDA());
1493: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
1494: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1495: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
1496: } else {
1497: PetscInt nnz;
1498: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1499: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
1500: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1501: delete cusparsestruct->workVector;
1502: delete cusparsestruct->rowoffsets_gpu;
1503: cusparsestruct->workVector = NULL;
1504: cusparsestruct->rowoffsets_gpu = NULL;
1505: try {
1506: if (a->compressedrow.use) {
1507: m = a->compressedrow.nrows;
1508: ii = a->compressedrow.i;
1509: ridx = a->compressedrow.rindex;
1510: } else {
1511: m = A->rmap->n;
1512: ii = a->i;
1513: ridx = NULL;
1514: }
1515: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
1516: if (!a->a) {
1517: nnz = ii[m];
1518: both = PETSC_FALSE;
1519: } else nnz = a->nz;
1520: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
1522: /* create cusparse matrix */
1523: cusparsestruct->nrows = m;
1524: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1525: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
1526: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
1527: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1529: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
1530: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
1531: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
1532: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1533: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1534: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1535: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
1537: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1538: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1539: /* set the matrix */
1540: CsrMatrix *mat = new CsrMatrix;
1541: mat->num_rows = m;
1542: mat->num_cols = A->cmap->n;
1543: mat->num_entries = nnz;
1544: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY(m + 1));
1545: mat->row_offsets->assign(ii, ii + m + 1);
1546: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY(nnz));
1547: mat->column_indices->assign(a->j, a->j + nnz);
1549: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
1550: if (a->a) mat->values->assign(a->a, a->a + nnz);
1552: /* assign the pointer */
1553: matstruct->mat = mat;
1554: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1555: PetscCallCUSPARSE(cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1556: }
1557: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1558: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1559: }
1561: /* assign the compressed row indices */
1562: if (a->compressedrow.use) {
1563: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
1564: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
1565: matstruct->cprowIndices->assign(ridx, ridx + m);
1566: tmp = m;
1567: } else {
1568: cusparsestruct->workVector = NULL;
1569: matstruct->cprowIndices = NULL;
1570: tmp = 0;
1571: }
1572: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
1574: /* assign the pointer */
1575: cusparsestruct->mat = matstruct;
1576: } catch (char *ex) {
1577: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
1578: }
1579: PetscCallCUDA(WaitForCUDA());
1580: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
1581: cusparsestruct->nonzerostate = A->nonzerostate;
1582: }
1583: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1584: }
1585: PetscFunctionReturn(PETSC_SUCCESS);
1586: }
1588: struct VecCUDAPlusEquals {
1589: template <typename Tuple>
1590: __host__ __device__ void operator()(Tuple t)
1591: {
1592: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1593: }
1594: };
1596: struct VecCUDAEquals {
1597: template <typename Tuple>
1598: __host__ __device__ void operator()(Tuple t)
1599: {
1600: thrust::get<1>(t) = thrust::get<0>(t);
1601: }
1602: };
1604: struct VecCUDAEqualsReverse {
1605: template <typename Tuple>
1606: __host__ __device__ void operator()(Tuple t)
1607: {
1608: thrust::get<0>(t) = thrust::get<1>(t);
1609: }
1610: };
1612: struct MatProductCtx_MatMatCusparse {
1613: PetscBool cisdense;
1614: PetscScalar *Bt;
1615: Mat X;
1616: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1617: PetscLogDouble flops;
1618: CsrMatrix *Bcsr;
1620: cusparseSpMatDescr_t matSpBDescr;
1621: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1622: cusparseDnMatDescr_t matBDescr;
1623: cusparseDnMatDescr_t matCDescr;
1624: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
1625: void *dBuffer4;
1626: void *dBuffer5;
1627: size_t mmBufferSize;
1628: void *mmBuffer;
1629: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1630: cusparseSpGEMMDescr_t spgemmDesc;
1631: };
1633: static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
1634: {
1635: MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;
1637: PetscFunctionBegin;
1638: PetscCallCUDA(cudaFree(mmdata->Bt));
1639: delete mmdata->Bcsr;
1640: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
1641: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1642: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1643: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
1644: PetscCallCUDA(cudaFree(mmdata->dBuffer4));
1645: PetscCallCUDA(cudaFree(mmdata->dBuffer5));
1646: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1647: PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
1648: PetscCall(MatDestroy(&mmdata->X));
1649: PetscCall(PetscFree(mmdata));
1650: PetscFunctionReturn(PETSC_SUCCESS);
1651: }
1653: #include <../src/mat/impls/dense/seq/dense.h>
1655: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1656: {
1657: Mat_Product *product = C->product;
1658: Mat A, B;
1659: PetscInt m, n, blda, clda;
1660: PetscBool flg, biscuda;
1661: Mat_SeqAIJCUSPARSE *cusp;
1662: cusparseOperation_t opA;
1663: const PetscScalar *barray;
1664: PetscScalar *carray;
1665: MatProductCtx_MatMatCusparse *mmdata;
1666: Mat_SeqAIJCUSPARSEMultStruct *mat;
1667: CsrMatrix *csrmat;
1669: PetscFunctionBegin;
1670: MatCheckProduct(C, 1);
1671: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1672: mmdata = (MatProductCtx_MatMatCusparse *)product->data;
1673: A = product->A;
1674: B = product->B;
1675: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1676: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1677: /* currently CopyToGpu does not copy if the matrix is bound to CPU
1678: Instead of silently accepting the wrong answer, I prefer to raise the error */
1679: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1680: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1681: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1682: switch (product->type) {
1683: case MATPRODUCT_AB:
1684: case MATPRODUCT_PtAP:
1685: mat = cusp->mat;
1686: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1687: m = A->rmap->n;
1688: n = B->cmap->n;
1689: break;
1690: case MATPRODUCT_AtB:
1691: if (!A->form_explicit_transpose) {
1692: mat = cusp->mat;
1693: opA = CUSPARSE_OPERATION_TRANSPOSE;
1694: } else {
1695: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1696: mat = cusp->matTranspose;
1697: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1698: }
1699: m = A->cmap->n;
1700: n = B->cmap->n;
1701: break;
1702: case MATPRODUCT_ABt:
1703: case MATPRODUCT_RARt:
1704: mat = cusp->mat;
1705: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1706: m = A->rmap->n;
1707: n = B->rmap->n;
1708: break;
1709: default:
1710: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1711: }
1712: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
1713: csrmat = (CsrMatrix *)mat->mat;
1714: /* if the user passed a CPU matrix, copy the data to the GPU */
1715: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
1716: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
1717: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
1719: PetscCall(MatDenseGetLDA(B, &blda));
1720: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1721: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
1722: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
1723: } else {
1724: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
1725: PetscCall(MatDenseGetLDA(C, &clda));
1726: }
1728: PetscCall(PetscLogGpuTimeBegin());
1729: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1730: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
1731: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
1732: #else
1733: cusparseSpMatDescr_t &matADescr = mat->matDescr;
1734: #endif
1736: /* (re)allocate mmBuffer if not initialized or LDAs are different */
1737: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1738: size_t mmBufferSize;
1739: if (mmdata->initialized && mmdata->Blda != blda) {
1740: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
1741: mmdata->matBDescr = NULL;
1742: }
1743: if (!mmdata->matBDescr) {
1744: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1745: mmdata->Blda = blda;
1746: }
1748: if (mmdata->initialized && mmdata->Clda != clda) {
1749: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
1750: mmdata->matCDescr = NULL;
1751: }
1752: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1753: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
1754: mmdata->Clda = clda;
1755: }
1757: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
1758: if (matADescr) {
1759: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
1760: matADescr = NULL;
1761: }
1762: #endif
1764: if (!matADescr) {
1765: PetscCallCUSPARSE(cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1766: }
1768: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
1770: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
1771: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
1772: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
1773: mmdata->mmBufferSize = mmBufferSize;
1774: }
1776: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
1777: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1778: #endif
1780: mmdata->initialized = PETSC_TRUE;
1781: } else {
1782: /* to be safe, always update pointers of the mats */
1783: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
1784: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
1785: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
1786: }
1788: /* do cusparseSpMM, which supports transpose on B */
1789: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
1791: PetscCall(PetscLogGpuTimeEnd());
1792: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
1793: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
1794: if (product->type == MATPRODUCT_RARt) {
1795: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1796: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
1797: } else if (product->type == MATPRODUCT_PtAP) {
1798: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
1799: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
1800: } else {
1801: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
1802: }
1803: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
1804: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1805: PetscFunctionReturn(PETSC_SUCCESS);
1806: }
1808: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1809: {
1810: Mat_Product *product = C->product;
1811: Mat A, B;
1812: PetscInt m, n;
1813: PetscBool cisdense, flg;
1814: MatProductCtx_MatMatCusparse *mmdata;
1815: Mat_SeqAIJCUSPARSE *cusp;
1817: PetscFunctionBegin;
1818: MatCheckProduct(C, 1);
1819: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
1820: A = product->A;
1821: B = product->B;
1822: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1823: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1824: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1825: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1826: switch (product->type) {
1827: case MATPRODUCT_AB:
1828: m = A->rmap->n;
1829: n = B->cmap->n;
1830: PetscCall(MatSetBlockSizesFromMats(C, A, B));
1831: break;
1832: case MATPRODUCT_AtB:
1833: m = A->cmap->n;
1834: n = B->cmap->n;
1835: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
1836: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1837: break;
1838: case MATPRODUCT_ABt:
1839: m = A->rmap->n;
1840: n = B->rmap->n;
1841: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
1842: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1843: break;
1844: case MATPRODUCT_PtAP:
1845: m = B->cmap->n;
1846: n = B->cmap->n;
1847: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
1848: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
1849: break;
1850: case MATPRODUCT_RARt:
1851: m = B->rmap->n;
1852: n = B->rmap->n;
1853: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
1854: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
1855: break;
1856: default:
1857: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1858: }
1859: PetscCall(MatSetSizes(C, m, n, m, n));
1860: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1861: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
1862: PetscCall(MatSetType(C, MATSEQDENSECUDA));
1864: /* product data */
1865: PetscCall(PetscNew(&mmdata));
1866: mmdata->cisdense = cisdense;
1867: /* for these products we need intermediate storage */
1868: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1869: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
1870: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
1871: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1872: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
1873: } else {
1874: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
1875: }
1876: }
1877: C->product->data = mmdata;
1878: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
1880: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1881: PetscFunctionReturn(PETSC_SUCCESS);
1882: }
1884: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1885: {
1886: Mat_Product *product = C->product;
1887: Mat A, B;
1888: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
1889: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1890: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1891: CsrMatrix *Acsr, *Bcsr, *Ccsr;
1892: PetscBool flg;
1893: MatProductType ptype;
1894: MatProductCtx_MatMatCusparse *mmdata;
1895: cusparseSpMatDescr_t BmatSpDescr;
1896: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
1898: PetscFunctionBegin;
1899: MatCheckProduct(C, 1);
1900: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
1901: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
1902: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
1903: mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
1904: A = product->A;
1905: B = product->B;
1906: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
1907: mmdata->reusesym = PETSC_FALSE;
1908: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1909: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1910: Cmat = Ccusp->mat;
1911: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
1912: Ccsr = (CsrMatrix *)Cmat->mat;
1913: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1914: goto finalize;
1915: }
1916: if (!c->nz) goto finalize;
1917: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1918: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
1919: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
1920: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
1921: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1922: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1923: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1924: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
1925: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
1926: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1927: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1928: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
1929: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1930: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
1932: ptype = product->type;
1933: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
1934: ptype = MATPRODUCT_AB;
1935: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
1936: }
1937: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
1938: ptype = MATPRODUCT_AB;
1939: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
1940: }
1941: switch (ptype) {
1942: case MATPRODUCT_AB:
1943: Amat = Acusp->mat;
1944: Bmat = Bcusp->mat;
1945: break;
1946: case MATPRODUCT_AtB:
1947: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
1948: Amat = Acusp->matTranspose;
1949: Bmat = Bcusp->mat;
1950: break;
1951: case MATPRODUCT_ABt:
1952: Amat = Acusp->mat;
1953: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
1954: Bmat = Bcusp->matTranspose;
1955: break;
1956: default:
1957: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
1958: }
1959: Cmat = Ccusp->mat;
1960: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
1961: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
1962: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
1963: Acsr = (CsrMatrix *)Amat->mat;
1964: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
1965: Ccsr = (CsrMatrix *)Cmat->mat;
1966: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
1967: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
1968: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
1969: PetscCall(PetscLogGpuTimeBegin());
1970: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
1971: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
1972: PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
1973: PetscCallCUSPARSE(cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
1974: PetscCall(PetscLogGpuFlops(mmdata->flops));
1975: PetscCallCUDA(WaitForCUDA());
1976: PetscCall(PetscLogGpuTimeEnd());
1977: C->offloadmask = PETSC_OFFLOAD_GPU;
1978: finalize:
1979: /* shorter version of MatAssemblyEnd_SeqAIJ */
1980: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
1981: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
1982: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
1983: c->reallocs = 0;
1984: C->info.mallocs += 0;
1985: C->info.nz_unneeded = 0;
1986: C->assembled = C->was_assembled = PETSC_TRUE;
1987: C->num_ass++;
1988: PetscFunctionReturn(PETSC_SUCCESS);
1989: }
1991: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
1992: {
1993: Mat_Product *product = C->product;
1994: Mat A, B;
1995: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
1996: Mat_SeqAIJ *a, *b, *c;
1997: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
1998: CsrMatrix *Acsr, *Bcsr, *Ccsr;
1999: PetscInt i, j, m, n, k;
2000: PetscBool flg;
2001: MatProductType ptype;
2002: MatProductCtx_MatMatCusparse *mmdata;
2003: PetscLogDouble flops;
2004: PetscBool biscompressed, ciscompressed;
2005: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2006: cusparseSpMatDescr_t BmatSpDescr;
2007: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2008: size_t bufSize2;
2010: PetscFunctionBegin;
2011: MatCheckProduct(C, 1);
2012: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2013: A = product->A;
2014: B = product->B;
2015: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2016: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2017: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2018: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2019: a = (Mat_SeqAIJ *)A->data;
2020: b = (Mat_SeqAIJ *)B->data;
2021: /* product data */
2022: PetscCall(PetscNew(&mmdata));
2023: C->product->data = mmdata;
2024: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2026: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2027: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2028: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2029: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2030: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2031: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2033: ptype = product->type;
2034: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2035: ptype = MATPRODUCT_AB;
2036: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2037: }
2038: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2039: ptype = MATPRODUCT_AB;
2040: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2041: }
2042: biscompressed = PETSC_FALSE;
2043: ciscompressed = PETSC_FALSE;
2044: switch (ptype) {
2045: case MATPRODUCT_AB:
2046: m = A->rmap->n;
2047: n = B->cmap->n;
2048: k = A->cmap->n;
2049: Amat = Acusp->mat;
2050: Bmat = Bcusp->mat;
2051: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2052: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2053: break;
2054: case MATPRODUCT_AtB:
2055: m = A->cmap->n;
2056: n = B->cmap->n;
2057: k = A->rmap->n;
2058: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2059: Amat = Acusp->matTranspose;
2060: Bmat = Bcusp->mat;
2061: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2062: break;
2063: case MATPRODUCT_ABt:
2064: m = A->rmap->n;
2065: n = B->rmap->n;
2066: k = A->cmap->n;
2067: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2068: Amat = Acusp->mat;
2069: Bmat = Bcusp->matTranspose;
2070: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2071: break;
2072: default:
2073: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2074: }
2076: /* create cusparse matrix */
2077: PetscCall(MatSetSizes(C, m, n, m, n));
2078: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2079: c = (Mat_SeqAIJ *)C->data;
2080: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2081: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2082: Ccsr = new CsrMatrix;
2084: c->compressedrow.use = ciscompressed;
2085: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2086: c->compressedrow.nrows = a->compressedrow.nrows;
2087: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
2088: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2089: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2090: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2091: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2092: } else {
2093: c->compressedrow.nrows = 0;
2094: c->compressedrow.i = NULL;
2095: c->compressedrow.rindex = NULL;
2096: Ccusp->workVector = NULL;
2097: Cmat->cprowIndices = NULL;
2098: }
2099: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2100: Ccusp->mat = Cmat;
2101: Ccusp->mat->mat = Ccsr;
2102: Ccsr->num_rows = Ccusp->nrows;
2103: Ccsr->num_cols = n;
2104: Ccsr->row_offsets = new THRUSTINTARRAY(Ccusp->nrows + 1);
2105: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
2106: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
2107: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2108: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
2109: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
2110: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
2111: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2112: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2113: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2114: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2115: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
2116: c->nz = 0;
2117: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2118: Ccsr->values = new THRUSTARRAY(c->nz);
2119: goto finalizesym;
2120: }
2122: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2123: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2124: Acsr = (CsrMatrix *)Amat->mat;
2125: if (!biscompressed) {
2126: Bcsr = (CsrMatrix *)Bmat->mat;
2127: BmatSpDescr = Bmat->matDescr;
2128: } else { /* we need to use row offsets for the full matrix */
2129: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2130: Bcsr = new CsrMatrix;
2131: Bcsr->num_rows = B->rmap->n;
2132: Bcsr->num_cols = cBcsr->num_cols;
2133: Bcsr->num_entries = cBcsr->num_entries;
2134: Bcsr->column_indices = cBcsr->column_indices;
2135: Bcsr->values = cBcsr->values;
2136: if (!Bcusp->rowoffsets_gpu) {
2137: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
2138: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2139: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2140: }
2141: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2142: mmdata->Bcsr = Bcsr;
2143: if (Bcsr->num_rows && Bcsr->num_cols) {
2144: PetscCallCUSPARSE(cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2145: }
2146: BmatSpDescr = mmdata->matSpBDescr;
2147: }
2148: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2149: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2150: /* precompute flops count */
2151: if (ptype == MATPRODUCT_AB) {
2152: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2153: const PetscInt st = a->i[i];
2154: const PetscInt en = a->i[i + 1];
2155: for (j = st; j < en; j++) {
2156: const PetscInt brow = a->j[j];
2157: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2158: }
2159: }
2160: } else if (ptype == MATPRODUCT_AtB) {
2161: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2162: const PetscInt anzi = a->i[i + 1] - a->i[i];
2163: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2164: flops += (2. * anzi) * bnzi;
2165: }
2166: } else { /* TODO */
2167: flops = 0.;
2168: }
2170: mmdata->flops = flops;
2171: PetscCall(PetscLogGpuTimeBegin());
2173: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2174: // cuda-12.2 requires non-null csrRowOffsets
2175: PetscCallCUSPARSE(cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2176: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2177: // Note that cusparseSpGEMMreuse is deprecated in CUDA 13.2.1
2179: PetscCheck(!PetscDefined(USE_64BIT_INDICES) || PETSC_PKG_CUDA_VERSION_GE(13, 0, 0), PETSC_COMM_SELF, PETSC_ERR_SUP_SYS, "cusparseSpGEMM did not support 64-bit indices before CUDA 13.0. Update your CUDA installation.");
2180: /* ask bufferSize bytes for external memory */
2181: PetscCallCUSPARSE(cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL));
2182: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2183: /* inspect the matrices A and B to understand the memory requirement for the next step */
2184: PetscCallCUSPARSE(cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2));
2185: /* ask bufferSize again bytes for external memory */
2186: PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL));
2187: /* The CUSPARSE documentation is not clear, nor the API
2188: We need both buffers to perform the operations properly!
2189: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2190: it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2191: is stored in the descriptor! What a messy API... */
2192: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2193: /* compute the intermediate product of A * B */
2194: PetscCallCUSPARSE(cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer));
2195: /* get matrix C non-zero entries C_nnz1 */
2196: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2197: PetscCall(PetscIntCast(C_nnz1, &c->nz));
2198: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2199: mmdata->mmBufferSize / 1024));
2200: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
2201: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2202: Ccsr->values = new THRUSTARRAY(c->nz);
2203: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2204: if (c->nz) PetscCallCUSPARSE(cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()));
2205: PetscCallCUSPARSE(cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc));
2206: PetscCall(PetscLogGpuFlops(mmdata->flops));
2207: PetscCall(PetscLogGpuTimeEnd());
2208: finalizesym:
2209: c->free_a = PETSC_TRUE;
2210: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
2211: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
2212: c->free_ij = PETSC_TRUE;
2214: PetscInt *d_i = c->i;
2215: if (ciscompressed) d_i = c->compressedrow.i;
2216: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2217: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
2218: if (ciscompressed) { /* need to expand host row offsets */
2219: PetscInt r = 0;
2220: c->i[0] = 0;
2221: for (k = 0; k < c->compressedrow.nrows; k++) {
2222: const PetscInt next = c->compressedrow.rindex[k];
2223: const PetscInt old = c->compressedrow.i[k];
2224: for (; r < next; r++) c->i[r + 1] = old;
2225: }
2226: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2227: }
2228: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
2229: PetscCall(PetscMalloc1(m, &c->ilen));
2230: PetscCall(PetscMalloc1(m, &c->imax));
2231: c->maxnz = c->nz;
2232: c->nonzerorowcnt = 0;
2233: c->rmax = 0;
2234: for (k = 0; k < m; k++) {
2235: const PetscInt nn = c->i[k + 1] - c->i[k];
2236: c->ilen[k] = c->imax[k] = nn;
2237: c->nonzerorowcnt += (PetscInt)!!nn;
2238: c->rmax = PetscMax(c->rmax, nn);
2239: }
2240: PetscCall(PetscMalloc1(c->nz, &c->a));
2241: Ccsr->num_entries = c->nz;
2243: C->nonzerostate++;
2244: PetscCall(PetscLayoutSetUp(C->rmap));
2245: PetscCall(PetscLayoutSetUp(C->cmap));
2246: Ccusp->nonzerostate = C->nonzerostate;
2247: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2248: C->preallocated = PETSC_TRUE;
2249: C->assembled = PETSC_FALSE;
2250: C->was_assembled = PETSC_FALSE;
2251: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2252: mmdata->reusesym = PETSC_TRUE;
2253: C->offloadmask = PETSC_OFFLOAD_GPU;
2254: }
2255: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2256: PetscFunctionReturn(PETSC_SUCCESS);
2257: }
2259: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2261: /* handles sparse or dense B */
2262: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2263: {
2264: Mat_Product *product = mat->product;
2265: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
2267: PetscFunctionBegin;
2268: MatCheckProduct(mat, 1);
2269: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
2270: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
2271: if (product->type == MATPRODUCT_ABC) {
2272: Ciscusp = PETSC_FALSE;
2273: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
2274: }
2275: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2276: PetscBool usecpu = PETSC_FALSE;
2277: switch (product->type) {
2278: case MATPRODUCT_AB:
2279: if (product->api_user) {
2280: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
2281: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2282: PetscOptionsEnd();
2283: } else {
2284: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
2285: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
2286: PetscOptionsEnd();
2287: }
2288: break;
2289: case MATPRODUCT_AtB:
2290: if (product->api_user) {
2291: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
2292: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2293: PetscOptionsEnd();
2294: } else {
2295: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
2296: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
2297: PetscOptionsEnd();
2298: }
2299: break;
2300: case MATPRODUCT_PtAP:
2301: if (product->api_user) {
2302: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
2303: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2304: PetscOptionsEnd();
2305: } else {
2306: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
2307: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
2308: PetscOptionsEnd();
2309: }
2310: break;
2311: case MATPRODUCT_RARt:
2312: if (product->api_user) {
2313: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
2314: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2315: PetscOptionsEnd();
2316: } else {
2317: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
2318: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
2319: PetscOptionsEnd();
2320: }
2321: break;
2322: case MATPRODUCT_ABC:
2323: if (product->api_user) {
2324: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
2325: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2326: PetscOptionsEnd();
2327: } else {
2328: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
2329: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
2330: PetscOptionsEnd();
2331: }
2332: break;
2333: default:
2334: break;
2335: }
2336: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2337: }
2338: /* dispatch */
2339: if (isdense) {
2340: switch (product->type) {
2341: case MATPRODUCT_AB:
2342: case MATPRODUCT_AtB:
2343: case MATPRODUCT_ABt:
2344: case MATPRODUCT_PtAP:
2345: case MATPRODUCT_RARt:
2346: if (product->A->boundtocpu) {
2347: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2348: } else {
2349: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2350: }
2351: break;
2352: case MATPRODUCT_ABC:
2353: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2354: break;
2355: default:
2356: break;
2357: }
2358: } else if (Biscusp && Ciscusp) {
2359: switch (product->type) {
2360: case MATPRODUCT_AB:
2361: case MATPRODUCT_AtB:
2362: case MATPRODUCT_ABt:
2363: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2364: break;
2365: case MATPRODUCT_PtAP:
2366: case MATPRODUCT_RARt:
2367: case MATPRODUCT_ABC:
2368: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2369: break;
2370: default:
2371: break;
2372: }
2373: } else { /* fallback for AIJ */
2374: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2375: }
2376: PetscFunctionReturn(PETSC_SUCCESS);
2377: }
2379: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2380: {
2381: PetscFunctionBegin;
2382: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
2383: PetscFunctionReturn(PETSC_SUCCESS);
2384: }
2386: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2387: {
2388: PetscFunctionBegin;
2389: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
2390: PetscFunctionReturn(PETSC_SUCCESS);
2391: }
2393: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2394: {
2395: PetscFunctionBegin;
2396: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
2397: PetscFunctionReturn(PETSC_SUCCESS);
2398: }
2400: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2401: {
2402: PetscFunctionBegin;
2403: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
2404: PetscFunctionReturn(PETSC_SUCCESS);
2405: }
2407: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
2408: {
2409: PetscFunctionBegin;
2410: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
2411: PetscFunctionReturn(PETSC_SUCCESS);
2412: }
2414: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
2415: {
2416: int i = blockIdx.x * blockDim.x + threadIdx.x;
2417: if (i < n) y[idx[i]] += x[i];
2418: }
2420: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2421: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
2422: {
2423: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2424: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2425: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2426: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
2427: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2428: PetscBool compressed;
2429: PetscInt nx, ny;
2431: PetscFunctionBegin;
2432: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
2433: if (!a->nz) {
2434: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
2435: else PetscCall(VecSeq_CUDA::Set(zz, 0));
2436: PetscFunctionReturn(PETSC_SUCCESS);
2437: }
2438: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2439: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2440: if (!trans) {
2441: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2442: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2443: } else {
2444: if (herm || !A->form_explicit_transpose) {
2445: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2446: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
2447: } else {
2448: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2449: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
2450: }
2451: }
2452: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2453: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2455: try {
2456: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
2457: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
2458: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
2460: PetscCall(PetscLogGpuTimeBegin());
2461: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2462: /* z = A x + beta y.
2463: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2464: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2465: */
2466: xptr = xarray;
2467: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2468: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2469: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2470: allocated to accommodate different uses. So we get the length info directly from mat.
2471: */
2472: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2473: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2474: nx = mat->num_cols; // since y = Ax
2475: ny = mat->num_rows;
2476: }
2477: } else {
2478: /* z = A^T x + beta y
2479: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2480: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2481: */
2482: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2483: dptr = zarray;
2484: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2485: if (compressed) { /* Scatter x to work vector */
2486: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2488: thrust::for_each(
2489: #if PetscDefined(HAVE_THRUST_ASYNC)
2490: thrust::cuda::par.on(PetscDefaultCudaStream),
2491: #endif
2492: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2493: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
2494: }
2495: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2496: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2497: nx = mat->num_rows; // since y = A^T x
2498: ny = mat->num_cols;
2499: }
2500: }
2502: /* csr_spmv does y = alpha op(A) x + beta y */
2503: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2504: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2505: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
2506: #else
2507: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
2508: #endif
2510: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2511: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2512: if (!matDescr) {
2513: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
2514: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2515: }
2516: #endif
2518: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2519: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
2520: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
2521: PetscCallCUSPARSE(
2522: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
2523: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
2524: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
2525: PetscCallCUSPARSE(
2526: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2527: #endif
2528: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2529: } else {
2530: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2531: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
2532: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
2533: }
2535: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
2537: } else {
2538: if (cusparsestruct->nrows) {
2539: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2540: }
2541: }
2542: PetscCall(PetscLogGpuTimeEnd());
2544: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2545: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2546: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2547: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
2548: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2549: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2550: }
2551: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2552: PetscCall(VecSeq_CUDA::Set(zz, 0));
2553: }
2555: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2556: if (compressed) {
2557: PetscCall(PetscLogGpuTimeBegin());
2558: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
2559: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
2560: PetscCall(PetscLogGpuTimeEnd());
2561: }
2562: } else {
2563: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
2564: }
2565: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
2566: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
2567: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
2568: } catch (char *ex) {
2569: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2570: }
2571: if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
2572: else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
2573: PetscFunctionReturn(PETSC_SUCCESS);
2574: }
2576: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
2577: {
2578: PetscFunctionBegin;
2579: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
2580: PetscFunctionReturn(PETSC_SUCCESS);
2581: }
2583: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
2584: {
2585: PetscFunctionBegin;
2586: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetDiagonal(A, diag));
2587: PetscFunctionReturn(PETSC_SUCCESS);
2588: }
2590: static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec ll, Vec rr)
2591: {
2592: PetscFunctionBegin;
2593: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::DiagonalScale(A, ll, rr));
2594: PetscFunctionReturn(PETSC_SUCCESS);
2595: }
2597: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
2598: {
2599: PetscFunctionBegin;
2600: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode));
2601: PetscFunctionReturn(PETSC_SUCCESS);
2602: }
2604: /*@
2605: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
2607: Collective
2609: Input Parameters:
2610: + comm - MPI communicator, set to `PETSC_COMM_SELF`
2611: . m - number of rows
2612: . n - number of columns
2613: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
2614: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
2616: Output Parameter:
2617: . A - the matrix
2619: Level: intermediate
2621: Notes:
2622: This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
2623: calculations. For good matrix assembly performance the user should preallocate the matrix
2624: storage by setting the parameter `nz` (or the array `nnz`).
2626: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2627: MatXXXXSetPreallocation() paradgm instead of this routine directly.
2628: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
2630: The AIJ format, also called
2631: compressed row storage, is fully compatible with standard Fortran
2632: storage. That is, the stored row and column indices can begin at
2633: either one (as in Fortran) or zero.
2635: Specify the preallocated storage with either nz or nnz (not both).
2636: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
2637: allocation.
2639: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
2641: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
2642: `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
2643: @*/
2644: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
2645: {
2646: return MatSeqAIJCUSPARSE_CUPM_t::CreateSeqAIJ(comm, m, n, nz, nnz, A);
2647: }
2649: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2650: {
2651: return MatSeqAIJCUSPARSE_CUPM_t::Destroy(A);
2652: }
2654: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
2655: {
2656: PetscFunctionBegin;
2657: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Duplicate(A, cpvalues, B));
2658: PetscFunctionReturn(PETSC_SUCCESS);
2659: }
2661: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
2662: {
2663: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
2664: Mat_SeqAIJCUSPARSE *cy;
2665: Mat_SeqAIJCUSPARSE *cx;
2666: CsrMatrix *csry, *csrx;
2668: PetscFunctionBegin;
2669: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
2670: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
2671: if (X->ops->axpy != Y->ops->axpy) {
2672: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2673: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2674: PetscFunctionReturn(PETSC_SUCCESS);
2675: }
2676: /* if we are here, it means both matrices are bound to GPU */
2677: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
2678: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
2679: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2680: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
2681: csry = (CsrMatrix *)cy->mat->mat;
2682: csrx = (CsrMatrix *)cx->mat->mat;
2683: /* see if we can turn this into a cublas axpy */
2684: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
2685: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
2686: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
2687: if (eq) str = SAME_NONZERO_PATTERN;
2688: }
2689: /* spgeam is buggy with one column */
2690: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
2692: #if !defined(PETSC_USE_64BIT_INDICES) // cusparseScsrgeam2 etc. do not support 64bit indices
2693: if (str == SUBSET_NONZERO_PATTERN) {
2694: PetscScalar *ay, b = 1.0;
2695: const PetscScalar *ax;
2696: size_t bufferSize;
2697: void *buffer;
2699: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
2700: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
2701: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
2702: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2703: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
2704: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
2705: PetscCall(PetscLogGpuTimeBegin());
2706: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
2707: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
2708: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
2709: PetscCall(PetscLogGpuTimeEnd());
2710: PetscCallCUDA(cudaFree(buffer));
2712: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
2713: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
2714: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
2715: } else
2716: #endif
2717: if (str == SAME_NONZERO_PATTERN) {
2718: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::AXPY_SameNZ(Y, a, X));
2719: } else {
2720: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
2721: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
2722: }
2723: PetscFunctionReturn(PETSC_SUCCESS);
2724: }
2726: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
2727: {
2728: PetscFunctionBegin;
2729: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::Scale(Y, a));
2730: PetscFunctionReturn(PETSC_SUCCESS);
2731: }
2733: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2734: {
2735: PetscFunctionBegin;
2736: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::ZeroEntries(A));
2737: PetscFunctionReturn(PETSC_SUCCESS);
2738: }
2740: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(Mat A, PetscMemType *m)
2741: {
2742: PetscFunctionBegin;
2743: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetCurrentMemType(A, m));
2744: PetscFunctionReturn(PETSC_SUCCESS);
2745: }
2747: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
2748: {
2749: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2751: PetscFunctionBegin;
2752: if (A->factortype != MAT_FACTOR_NONE) {
2753: A->boundtocpu = flg;
2754: PetscFunctionReturn(PETSC_SUCCESS);
2755: }
2756: if (flg) {
2757: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2759: A->ops->scale = MatScale_SeqAIJ;
2760: A->ops->getdiagonal = MatGetDiagonal_SeqAIJ;
2761: A->ops->diagonalscale = MatDiagonalScale_SeqAIJ;
2762: A->ops->axpy = MatAXPY_SeqAIJ;
2763: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
2764: A->ops->mult = MatMult_SeqAIJ;
2765: A->ops->multadd = MatMultAdd_SeqAIJ;
2766: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
2767: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
2768: A->ops->multhermitiantranspose = NULL;
2769: A->ops->multhermitiantransposeadd = NULL;
2770: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
2771: A->ops->getcurrentmemtype = NULL;
2772: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
2773: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
2774: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
2775: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
2776: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
2777: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
2778: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
2779: } else {
2780: A->ops->scale = MatScale_SeqAIJCUSPARSE;
2781: A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE;
2782: A->ops->diagonalscale = MatDiagonalScale_SeqAIJCUSPARSE;
2783: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
2784: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
2785: A->ops->mult = MatMult_SeqAIJCUSPARSE;
2786: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
2787: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
2788: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
2789: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2790: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2791: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
2792: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
2793: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
2794: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
2795: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
2796: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
2797: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
2798: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
2799: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
2801: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
2802: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2803: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2804: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
2805: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
2806: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
2807: }
2808: A->boundtocpu = flg;
2809: a->inode.use = (flg && a->inode.size_csr) ? PETSC_TRUE : PETSC_FALSE;
2810: PetscFunctionReturn(PETSC_SUCCESS);
2811: }
2813: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
2814: {
2815: Mat B;
2817: PetscFunctionBegin;
2818: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
2819: if (reuse == MAT_INITIAL_MATRIX) {
2820: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
2821: } else if (reuse == MAT_REUSE_MATRIX) {
2822: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
2823: }
2824: B = *newmat;
2826: PetscCall(PetscFree(B->defaultvectype));
2827: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
2829: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2830: if (B->factortype == MAT_FACTOR_NONE) {
2831: Mat_SeqAIJCUSPARSE *spptr;
2832: PetscCall(PetscNew(&spptr));
2833: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2834: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2835: spptr->format = MAT_CUSPARSE_CSR;
2836: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
2837: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
2838: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
2839: B->spptr = spptr;
2840: } else {
2841: Mat_SeqAIJCUSPARSETriFactors *spptr;
2843: PetscCall(PetscNew(&spptr));
2844: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
2845: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
2846: B->spptr = spptr;
2847: }
2848: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2849: }
2850: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
2851: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
2852: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
2853: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2854: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
2855: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
2856: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
2858: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
2859: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
2860: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
2861: #if defined(PETSC_HAVE_HYPRE)
2862: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
2863: #endif
2864: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
2865: PetscFunctionReturn(PETSC_SUCCESS);
2866: }
2868: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2869: {
2870: PetscFunctionBegin;
2871: PetscCall(MatCreate_SeqAIJ(B));
2872: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
2873: PetscFunctionReturn(PETSC_SUCCESS);
2874: }
2876: /*MC
2877: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
2879: Options Database Keys:
2880: + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
2881: . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
2882: Other options include ell (ellpack) or hyb (hybrid).
2883: . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
2884: - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
2886: Level: beginner
2888: Notes:
2889: These matrices can be in either CSR, ELL, or HYB format.
2891: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
2893: Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
2894: if some integer values passed in do not fit in `int`.
2896: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2897: M*/
2899: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2900: {
2901: PetscFunctionBegin;
2902: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
2903: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
2904: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
2905: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
2906: PetscFunctionReturn(PETSC_SUCCESS);
2907: }
2909: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
2910: {
2911: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
2913: PetscFunctionBegin;
2914: if (cusp) {
2915: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
2916: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
2917: delete cusp->workVector;
2918: delete cusp->rowoffsets_gpu;
2919: delete cusp->csr2csc_i;
2920: delete cusp->coords;
2921: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
2922: PetscCall(PetscFree(mat->spptr));
2923: }
2924: PetscFunctionReturn(PETSC_SUCCESS);
2925: }
2927: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2928: {
2929: PetscFunctionBegin;
2930: if (*mat) {
2931: delete (*mat)->values;
2932: delete (*mat)->column_indices;
2933: delete (*mat)->row_offsets;
2934: delete *mat;
2935: *mat = 0;
2936: }
2937: PetscFunctionReturn(PETSC_SUCCESS);
2938: }
2940: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
2941: {
2942: CsrMatrix *mat;
2944: PetscFunctionBegin;
2945: if (*matstruct) {
2946: if ((*matstruct)->mat) {
2947: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
2948: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2949: } else {
2950: mat = (CsrMatrix *)(*matstruct)->mat;
2951: PetscCall(CsrMatrix_Destroy(&mat));
2952: }
2953: }
2954: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
2955: delete (*matstruct)->cprowIndices;
2956: PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
2957: PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
2958: PetscCallCUDA(cudaFree((*matstruct)->beta_one));
2960: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2961: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
2963: for (int i = 0; i < 3; i++) {
2964: if (mdata->cuSpMV[i].initialized) {
2965: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
2966: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
2967: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
2968: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2969: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
2970: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
2971: #endif
2972: }
2973: }
2974: delete *matstruct;
2975: *matstruct = NULL;
2976: }
2977: PetscFunctionReturn(PETSC_SUCCESS);
2978: }
2980: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
2981: {
2982: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
2984: PetscFunctionBegin;
2985: if (fs) {
2986: delete fs->rpermIndices;
2987: delete fs->cpermIndices;
2988: fs->rpermIndices = NULL;
2989: fs->cpermIndices = NULL;
2990: fs->init_dev_prop = PETSC_FALSE;
2991: PetscCallCUDA(cudaFree(fs->csrRowPtr));
2992: PetscCallCUDA(cudaFree(fs->csrColIdx));
2993: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
2994: PetscCallCUDA(cudaFree(fs->csrColIdx32));
2995: PetscCallCUDA(cudaFree(fs->csrVal));
2996: PetscCallCUDA(cudaFree(fs->diag));
2997: PetscCallCUDA(cudaFree(fs->X));
2998: PetscCallCUDA(cudaFree(fs->Y));
2999: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3000: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3001: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
3002: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3003: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3004: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3005: if (fs->spMatDescr_L) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3006: if (fs->spMatDescr_U) PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3007: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3008: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3009: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3010: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3011: if (fs->dnVecDescr_X) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3012: if (fs->dnVecDescr_Y) PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3013: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3014: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
3015: PetscCall(PetscFree(fs->csrRowPtr_h));
3016: PetscCall(PetscFree(fs->csrVal_h));
3017: PetscCall(PetscFree(fs->diag_h));
3018: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3019: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3020: }
3021: PetscFunctionReturn(PETSC_SUCCESS);
3022: }
3024: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3025: {
3026: PetscFunctionBegin;
3027: if (*trifactors) {
3028: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3029: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
3030: PetscCall(PetscFree(*trifactors));
3031: }
3032: PetscFunctionReturn(PETSC_SUCCESS);
3033: }
3035: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3036: {
3037: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3039: PetscFunctionBegin;
3040: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3041: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
3042: if (destroy) {
3043: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
3044: delete cusp->csr2csc_i;
3045: cusp->csr2csc_i = NULL;
3046: }
3047: A->transupdated = PETSC_FALSE;
3048: PetscFunctionReturn(PETSC_SUCCESS);
3049: }
3051: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
3052: {
3053: PetscFunctionBegin;
3054: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j));
3055: PetscFunctionReturn(PETSC_SUCCESS);
3056: }
3058: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3059: {
3060: PetscFunctionBegin;
3061: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::SetValuesCOO(A, v, imode));
3062: PetscFunctionReturn(PETSC_SUCCESS);
3063: }
3065: /*@C
3066: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
3068: Not Collective
3070: Input Parameters:
3071: + A - the matrix
3072: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3074: Output Parameters:
3075: + i - the CSR row pointers
3076: - j - the CSR column indices
3078: Level: developer
3080: Note:
3081: When compressed is true, the CSR structure does not contain empty rows
3083: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
3084: @*/
3085: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3086: {
3087: PetscFunctionBegin;
3088: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::GetIJ(A, compressed, i, j));
3089: PetscFunctionReturn(PETSC_SUCCESS);
3090: }
3092: /*@C
3093: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
3095: Not Collective
3097: Input Parameters:
3098: + A - the matrix
3099: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
3100: . i - the CSR row pointers
3101: - j - the CSR column indices
3103: Level: developer
3105: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
3106: @*/
3107: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const PetscInt *i[], const PetscInt *j[])
3108: {
3109: PetscFunctionBegin;
3110: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::RestoreIJ(A, compressed, i, j));
3111: PetscFunctionReturn(PETSC_SUCCESS);
3112: }
3114: /*@C
3115: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
3117: Not Collective
3119: Input Parameter:
3120: . A - a `MATSEQAIJCUSPARSE` matrix
3122: Output Parameter:
3123: . a - pointer to the device data
3125: Level: developer
3127: Note:
3128: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
3130: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
3131: @*/
3132: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
3133: {
3134: return MatSeqAIJCUSPARSE_CUPM_t::GetArrayRead(A, a);
3135: }
3137: /*@C
3138: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
3140: Not Collective
3142: Input Parameters:
3143: + A - a `MATSEQAIJCUSPARSE` matrix
3144: - a - pointer to the device data
3146: Level: developer
3148: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
3149: @*/
3150: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
3151: {
3152: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayRead(A, a);
3153: }
3155: /*@C
3156: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
3158: Not Collective
3160: Input Parameter:
3161: . A - a `MATSEQAIJCUSPARSE` matrix
3163: Output Parameter:
3164: . a - pointer to the device data
3166: Level: developer
3168: Note:
3169: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
3171: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
3172: @*/
3173: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
3174: {
3175: return MatSeqAIJCUSPARSE_CUPM_t::GetArray(A, a);
3176: }
3177: /*@C
3178: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
3180: Not Collective
3182: Input Parameters:
3183: + A - a `MATSEQAIJCUSPARSE` matrix
3184: - a - pointer to the device data
3186: Level: developer
3188: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
3189: @*/
3190: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
3191: {
3192: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArray(A, a);
3193: }
3195: /*@C
3196: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
3198: Not Collective
3200: Input Parameter:
3201: . A - a `MATSEQAIJCUSPARSE` matrix
3203: Output Parameter:
3204: . a - pointer to the device data
3206: Level: developer
3208: Note:
3209: Does not trigger any host to device copies.
3211: It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
3213: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
3214: @*/
3215: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
3216: {
3217: return MatSeqAIJCUSPARSE_CUPM_t::GetArrayWrite(A, a);
3218: }
3220: /*@C
3221: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
3223: Not Collective
3225: Input Parameters:
3226: + A - a `MATSEQAIJCUSPARSE` matrix
3227: - a - pointer to the device data
3229: Level: developer
3231: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
3232: @*/
3233: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
3234: {
3235: return MatSeqAIJCUSPARSE_CUPM_t::RestoreArrayWrite(A, a);
3236: }
3238: struct IJCompare4 {
3239: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt, PetscScalar, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt, PetscScalar, PetscInt> &t2)
3240: {
3241: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
3242: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
3243: return false;
3244: }
3245: };
3247: struct Shift {
3248: PetscInt _shift;
3250: Shift(PetscInt shift) : _shift(shift) { }
3251: __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c + _shift; }
3252: };
3254: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
3255: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
3256: {
3257: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
3258: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
3259: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3260: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3261: PetscInt Annz, Bnnz;
3262: PetscInt i, m, n, zero = 0;
3264: PetscFunctionBegin;
3267: PetscAssertPointer(C, 4);
3268: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
3269: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
3270: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
3271: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
3272: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3273: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3274: if (reuse == MAT_INITIAL_MATRIX) {
3275: m = A->rmap->n;
3276: n = A->cmap->n + B->cmap->n;
3277: PetscCall(MatCreate(PETSC_COMM_SELF, C));
3278: PetscCall(MatSetSizes(*C, m, n, m, n));
3279: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
3280: c = (Mat_SeqAIJ *)(*C)->data;
3281: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3282: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3283: Ccsr = new CsrMatrix;
3284: Cmat->cprowIndices = NULL;
3285: c->compressedrow.use = PETSC_FALSE;
3286: c->compressedrow.nrows = 0;
3287: c->compressedrow.i = NULL;
3288: c->compressedrow.rindex = NULL;
3289: Ccusp->workVector = NULL;
3290: Ccusp->nrows = m;
3291: Ccusp->mat = Cmat;
3292: Ccusp->mat->mat = Ccsr;
3293: Ccsr->num_rows = m;
3294: Ccsr->num_cols = n;
3295: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3296: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3297: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3298: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3299: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3300: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3301: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3302: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3303: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3304: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3305: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3306: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3307: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3309: Acsr = (CsrMatrix *)Acusp->mat->mat;
3310: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3311: Annz = (PetscInt)Acsr->column_indices->size();
3312: Bnnz = (PetscInt)Bcsr->column_indices->size();
3313: c->nz = Annz + Bnnz;
3314: Ccsr->row_offsets = new THRUSTINTARRAY(m + 1);
3315: Ccsr->column_indices = new THRUSTINTARRAY(c->nz);
3316: Ccsr->values = new THRUSTARRAY(c->nz);
3317: Ccsr->num_entries = c->nz;
3318: Ccusp->coords = new THRUSTINTARRAY(c->nz);
3319: if (c->nz) {
3320: auto Acoo = new THRUSTINTARRAY(Annz); // initialized with zeros
3321: auto Bcoo = new THRUSTINTARRAY(Bnnz);
3322: auto Ccoo = new THRUSTINTARRAY(c->nz);
3323: THRUSTINTARRAY *Aroff, *Broff;
3325: if (a->compressedrow.use) { /* need full row offset */
3326: if (!Acusp->rowoffsets_gpu) {
3327: Acusp->rowoffsets_gpu = new THRUSTINTARRAY(A->rmap->n + 1);
3328: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
3329: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
3330: }
3331: Aroff = Acusp->rowoffsets_gpu;
3332: } else Aroff = Acsr->row_offsets;
3333: if (b->compressedrow.use) { /* need full row offset */
3334: if (!Bcusp->rowoffsets_gpu) {
3335: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY(B->rmap->n + 1);
3336: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3337: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3338: }
3339: Broff = Bcusp->rowoffsets_gpu;
3340: } else Broff = Bcsr->row_offsets;
3341: PetscCall(PetscLogGpuTimeBegin());
3342: // Implement cusparseXcsr2coo() with Thrust, as the former doesn't support 64-bit indices.
3343: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Aroff->data().get(), Acoo->data().get())));
3344: PetscCallThrust(thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m), Csr2coo(Broff->data().get(), Bcoo->data().get())));
3346: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3347: auto Aperm = thrust::make_constant_iterator(1);
3348: auto Bperm = thrust::make_constant_iterator(0);
3349: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
3350: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
3351: auto wPerm = new THRUSTINTARRAY(Annz + Bnnz);
3352: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
3353: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
3354: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); // Use B column indices shifted by A->cmap->n
3355: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
3356: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
3357: auto p1 = Ccusp->coords->begin();
3358: auto p2 = Ccusp->coords->begin();
3359: #if CCCL_VERSION >= 3001000
3360: cuda::std::advance(p2, Annz);
3361: #else
3362: thrust::advance(p2, Annz);
3363: #endif
3364: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); // put nonzeros in A and B to C in sorted order (by row and then by column)
3365: auto cci = thrust::make_counting_iterator(zero);
3366: auto cce = thrust::make_counting_iterator(c->nz);
3367: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
3368: auto pred = thrust::identity<int>();
3369: #else
3370: auto pred = cuda::std::identity();
3371: #endif
3372: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
3373: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
3374: // Implement a simplified cusparseXcoo2csr() with Thrust (assuming the row indices are already sorted), as the former doesn't support 64-bit indices.
3375: PetscCallThrust(thrust::lower_bound(thrust::device, Ccoo->begin(), Ccoo->end(), thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(m + 1), Ccsr->row_offsets->begin()));
3376: PetscCall(PetscLogGpuTimeEnd());
3377: delete wPerm;
3378: delete Acoo;
3379: delete Bcoo;
3380: delete Ccoo;
3381: PetscCallCUSPARSE(cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3382: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
3383: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3384: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3385: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3386: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
3387: CsrMatrix *CcsrT = new CsrMatrix;
3388: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3389: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3391: (*C)->form_explicit_transpose = PETSC_TRUE;
3392: (*C)->transupdated = PETSC_TRUE;
3393: Ccusp->rowoffsets_gpu = NULL;
3394: CmatT->cprowIndices = NULL;
3395: CmatT->mat = CcsrT;
3396: CcsrT->num_rows = n;
3397: CcsrT->num_cols = m;
3398: CcsrT->num_entries = c->nz;
3400: CcsrT->row_offsets = new THRUSTINTARRAY(n + 1);
3401: CcsrT->column_indices = new THRUSTINTARRAY(c->nz);
3402: CcsrT->values = new THRUSTARRAY(c->nz);
3404: PetscCall(PetscLogGpuTimeBegin());
3405: auto rT = CcsrT->row_offsets->begin();
3406: if (AT) {
3407: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
3408: #if CCCL_VERSION >= 3001000
3409: cuda::std::advance(rT, -1);
3410: #else
3411: thrust::advance(rT, -1);
3412: #endif
3413: }
3414: if (BT) {
3415: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
3416: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
3417: thrust::copy(titb, tite, rT);
3418: }
3419: auto cT = CcsrT->column_indices->begin();
3420: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
3421: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
3422: auto vT = CcsrT->values->begin();
3423: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3424: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3425: PetscCall(PetscLogGpuTimeEnd());
3427: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
3428: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
3429: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3430: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
3431: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
3432: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
3433: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3434: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3435: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3436: PetscCallCUSPARSE(cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), csrRowOffsetsType, csrColIndType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3437: Ccusp->matTranspose = CmatT;
3438: }
3439: }
3441: c->free_a = PETSC_TRUE;
3442: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3443: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3444: c->free_ij = PETSC_TRUE;
3445: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3446: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3447: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3448: PetscCall(PetscMalloc1(m, &c->ilen));
3449: PetscCall(PetscMalloc1(m, &c->imax));
3450: c->maxnz = c->nz;
3451: c->nonzerorowcnt = 0;
3452: c->rmax = 0;
3453: for (i = 0; i < m; i++) {
3454: const PetscInt nn = c->i[i + 1] - c->i[i];
3455: c->ilen[i] = c->imax[i] = nn;
3456: c->nonzerorowcnt += (PetscInt)!!nn;
3457: c->rmax = PetscMax(c->rmax, nn);
3458: }
3459: PetscCall(PetscMalloc1(c->nz, &c->a));
3460: (*C)->nonzerostate++;
3461: PetscCall(PetscLayoutSetUp((*C)->rmap));
3462: PetscCall(PetscLayoutSetUp((*C)->cmap));
3463: Ccusp->nonzerostate = (*C)->nonzerostate;
3464: (*C)->preallocated = PETSC_TRUE;
3465: } else {
3466: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
3467: c = (Mat_SeqAIJ *)(*C)->data;
3468: if (c->nz) {
3469: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
3470: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
3471: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
3472: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
3473: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3474: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3475: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3476: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
3477: Acsr = (CsrMatrix *)Acusp->mat->mat;
3478: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
3479: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
3480: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
3481: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
3482: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
3483: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
3484: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
3485: auto pmid = Ccusp->coords->begin();
3486: #if CCCL_VERSION >= 3001000
3487: cuda::std::advance(pmid, Acsr->num_entries);
3488: #else
3489: thrust::advance(pmid, Acsr->num_entries);
3490: #endif
3491: PetscCall(PetscLogGpuTimeBegin());
3492: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
3493: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3494: thrust::for_each(zibait, zieait, VecCUDAEquals());
3495: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
3496: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
3497: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
3498: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
3499: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
3500: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
3501: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
3502: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
3503: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
3504: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
3505: auto vT = CcsrT->values->begin();
3506: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
3507: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
3508: (*C)->transupdated = PETSC_TRUE;
3509: }
3510: PetscCall(PetscLogGpuTimeEnd());
3511: }
3512: }
3513: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
3514: (*C)->assembled = PETSC_TRUE;
3515: (*C)->was_assembled = PETSC_FALSE;
3516: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
3517: PetscFunctionReturn(PETSC_SUCCESS);
3518: }
3520: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
3521: {
3522: PetscFunctionBegin;
3523: PetscCall(MatSeqAIJCUSPARSE_CUPM_t::CopySubArray(A, n, idx, v));
3524: PetscFunctionReturn(PETSC_SUCCESS);
3525: }