Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #endif
19: #include <thrust/iterator/constant_iterator.h>
20: #include <thrust/remove.h>
21: #include <thrust/sort.h>
22: #include <thrust/unique.h>
23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24: #include <cuda/std/functional>
25: #endif
27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
30: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32: typedef enum {
33: CUSPARSE_MV_ALG_DEFAULT = 0,
34: CUSPARSE_COOMV_ALG = 1,
35: CUSPARSE_CSRMV_ALG1 = 2,
36: CUSPARSE_CSRMV_ALG2 = 3
37: } cusparseSpMVAlg_t;
39: typedef enum {
40: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
41: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
42: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
43: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
44: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
45: CUSPARSE_SPMM_ALG_DEFAULT = 0,
46: CUSPARSE_SPMM_COO_ALG1 = 1,
47: CUSPARSE_SPMM_COO_ALG2 = 2,
48: CUSPARSE_SPMM_COO_ALG3 = 3,
49: CUSPARSE_SPMM_COO_ALG4 = 5,
50: CUSPARSE_SPMM_CSR_ALG1 = 4,
51: CUSPARSE_SPMM_CSR_ALG2 = 6,
52: } cusparseSpMMAlg_t;
54: typedef enum {
55: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
56: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
57: } cusparseCsr2CscAlg_t;
58: */
59: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
60: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
61: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
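/* For example (an illustrative note): -mat_cusparse_spmv_alg CSRMV_ALG1 selects position 2 in MatCUSPARSESpMVAlgorithms[],
   and PETSc relies on the matching cuSPARSE enum (CUSPARSE_CSRMV_ALG1, or CUSPARSE_SPMV_CSR_ALG1 with CUDA >= 11.4) also
   having the integer value 2; the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below verify exactly this. */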
62: #endif
64: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
66: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
67: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
69: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
70: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
74: #endif
75: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
76: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
77: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
78: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
79: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
86: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
91: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
94: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
99: {
100: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
102: PetscFunctionBegin;
103: switch (op) {
104: case MAT_CUSPARSE_MULT:
105: cusparsestruct->format = format;
106: break;
107: case MAT_CUSPARSE_ALL:
108: cusparsestruct->format = format;
109: break;
110: default:
111: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112: }
113: PetscFunctionReturn(PETSC_SUCCESS);
114: }
116: /*@
117: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118: operation. Only the `MatMult()` operation can use different GPU storage formats.
120: Not Collective
122: Input Parameters:
123: + A - Matrix of type `MATSEQAIJCUSPARSE`
124: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
125: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).
128: Level: intermediate
130: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
131: @*/
132: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
133: {
134: PetscFunctionBegin;
136: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
137: PetscFunctionReturn(PETSC_SUCCESS);
138: }
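/*
   Example usage (a minimal sketch): select the ELL format for MatMult() on a MATSEQAIJCUSPARSE matrix

     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   or equivalently pass -mat_cusparse_mult_storage_format ell on the command line (parsed in MatSetFromOptions_SeqAIJCUSPARSE() below).
*/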
140: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
141: {
142: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
144: PetscFunctionBegin;
145: cusparsestruct->use_cpu_solve = use_cpu;
146: PetscFunctionReturn(PETSC_SUCCESS);
147: }
149: /*@
150: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
152: Input Parameters:
153: + A - Matrix of type `MATSEQAIJCUSPARSE`
154: - use_cpu - flag to use the built-in CPU `MatSolve()`
156: Level: intermediate
158: Note:
159: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
160: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
161: This method specifies whether the solve is done on the CPU or the GPU (the GPU is the default).
163: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
164: @*/
165: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
166: {
167: PetscFunctionBegin;
169: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
170: PetscFunctionReturn(PETSC_SUCCESS);
171: }
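/*
   Example usage (a minimal sketch): keep the (I)LU triangular solves on the CPU while the other matrix operations stay on the GPU

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or equivalently pass -mat_cusparse_use_cpu_solve on the command line (parsed in MatSetFromOptions_SeqAIJCUSPARSE() below).
*/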
173: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
174: {
175: PetscFunctionBegin;
176: switch (op) {
177: case MAT_FORM_EXPLICIT_TRANSPOSE:
178: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
179: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
180: A->form_explicit_transpose = flg;
181: break;
182: default:
183: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
184: break;
185: }
186: PetscFunctionReturn(PETSC_SUCCESS);
187: }
189: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
190: {
191: MatCUSPARSEStorageFormat format;
192: PetscBool flg;
193: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
195: PetscFunctionBegin;
196: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
197: if (A->factortype == MAT_FACTOR_NONE) {
198: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
199: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
201: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
202: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
203: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
204: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
205: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
206: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
207: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
208: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
209: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
210: #else
211: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
212: #endif
213: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
214: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
216: PetscCall(
217: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
218: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
219: #endif
220: }
221: PetscOptionsHeadEnd();
222: PetscFunctionReturn(PETSC_SUCCESS);
223: }
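/*
   For illustration (a sketch of typical usage, not taken verbatim from this file), the options above might be combined as

     -mat_type seqaijcusparse -mat_cusparse_storage_format ell -mat_cusparse_spmv_alg CSRMV_ALG1 -mat_cusparse_use_cpu_solve

   where each enum value is matched by position against the corresponding string array (MatCUSPARSEStorageFormats[],
   MatCUSPARSESpMVAlgorithms[], ...) defined above.
*/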
225: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
226: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
227: {
228: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
229: PetscInt m = A->rmap->n;
230: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
231: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
232: const MatScalar *Aa = a->a;
233: PetscInt *Mi, *Mj, Mnz;
234: PetscScalar *Ma;
236: PetscFunctionBegin;
237: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
238: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m = 0
239: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
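      // Illustrative note: each row i of M packs the llen = Ai[i+1] - Ai[i] entries of L, then the diagonal entry of row i,
      // then the ulen - 1 entries of U to the right of the diagonal (with ulen = Adiag[i] - Adiag[i+1]), so that
      // Mi[i+1] = Mi[i] + llen + ulen in the loop below.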
240: Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
241: PetscCall(PetscMalloc1(m + 1, &Mi));
242: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
243: PetscCall(PetscMalloc1(Mnz, &Ma));
244: Mi[0] = 0;
245: for (PetscInt i = 0; i < m; i++) {
246: PetscInt llen = Ai[i + 1] - Ai[i];
247: PetscInt ulen = Adiag[i] - Adiag[i + 1];
248: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
249: Mj[Mi[i] + llen] = i; // diagonal entry
250: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
251: Mi[i + 1] = Mi[i] + llen + ulen;
252: }
253: // Copy M (L,U) from host to device
254: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
255: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
256: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
257: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
258: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
260: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
261: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
262: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
263: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
264: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
265: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
266: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
267: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
269: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
270: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
271: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
273: fillMode = CUSPARSE_FILL_MODE_UPPER;
274: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
275: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
276: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
277: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
279: // Allocate work vectors in SpSv
280: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
281: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
283: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
284: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
286: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
287: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
288: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
289: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
290: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
291: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
292: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
294: // Record for reuse
295: fs->csrRowPtr_h = Mi;
296: fs->csrVal_h = Ma;
297: PetscCall(PetscFree(Mj));
298: }
299: // Copy the value
300: Mi = fs->csrRowPtr_h;
301: Ma = fs->csrVal_h;
302: Mnz = Mi[m];
303: for (PetscInt i = 0; i < m; i++) {
304: PetscInt llen = Ai[i + 1] - Ai[i];
305: PetscInt ulen = Adiag[i] - Adiag[i + 1];
306: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
307: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry
308: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
309: }
310: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
312: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
313: if (fs->updatedSpSVAnalysis) { // cusparseSpSV_analysis() has been done before and only the matrix values have changed
314: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
315: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
316: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
317: } else
318: #endif
319: {
320: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
321: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
323: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
324: fs->updatedSpSVAnalysis = PETSC_TRUE;
325: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
326: }
327: }
328: PetscFunctionReturn(PETSC_SUCCESS);
329: }
330: #else
331: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
332: {
333: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
334: PetscInt n = A->rmap->n;
335: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
336: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
337: const PetscInt *ai = a->i, *aj = a->j, *vi;
338: const MatScalar *aa = a->a, *v;
339: PetscInt *AiLo, *AjLo;
340: PetscInt i, nz, nzLower, offset, rowOffset;
342: PetscFunctionBegin;
343: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
344: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
345: try {
346: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
347: nzLower = n + ai[n] - ai[1];
348: if (!loTriFactor) {
349: PetscScalar *AALo;
351: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
353: /* Allocate Space for the lower triangular matrix */
354: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
355: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
357: /* Fill the lower triangular matrix */
358: AiLo[0] = (PetscInt)0;
359: AiLo[n] = nzLower;
360: AjLo[0] = (PetscInt)0;
361: AALo[0] = (MatScalar)1.0;
362: v = aa;
363: vi = aj;
364: offset = 1;
365: rowOffset = 1;
366: for (i = 1; i < n; i++) {
367: nz = ai[i + 1] - ai[i];
368: /* additional 1 for the term on the diagonal */
369: AiLo[i] = rowOffset;
370: rowOffset += nz + 1;
372: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
373: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
375: offset += nz;
376: AjLo[offset] = (PetscInt)i;
377: AALo[offset] = (MatScalar)1.0;
378: offset += 1;
380: v += nz;
381: vi += nz;
382: }
384: /* allocate space for the triangular factor information */
385: PetscCall(PetscNew(&loTriFactor));
386: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
387: /* Create the matrix description */
388: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
389: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
390: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
391: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
392: #else
393: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
394: #endif
395: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
396: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
398: /* set the operation */
399: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
401: /* set the matrix */
402: loTriFactor->csrMat = new CsrMatrix;
403: loTriFactor->csrMat->num_rows = n;
404: loTriFactor->csrMat->num_cols = n;
405: loTriFactor->csrMat->num_entries = nzLower;
407: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
408: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
410: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
411: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
413: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
414: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
416: /* Create the solve analysis information */
417: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
418: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
419: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
420: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
421: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
422: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
423: #endif
425: /* perform the solve analysis */
426: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
427: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
428: PetscCallCUDA(WaitForCUDA());
429: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
431: /* assign the pointer */
432: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
433: loTriFactor->AA_h = AALo;
434: PetscCallCUDA(cudaFreeHost(AiLo));
435: PetscCallCUDA(cudaFreeHost(AjLo));
436: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
437: } else { /* update values only */
438: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
439: /* Fill the lower triangular matrix */
440: loTriFactor->AA_h[0] = 1.0;
441: v = aa;
442: vi = aj;
443: offset = 1;
444: for (i = 1; i < n; i++) {
445: nz = ai[i + 1] - ai[i];
446: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
447: offset += nz;
448: loTriFactor->AA_h[offset] = 1.0;
449: offset += 1;
450: v += nz;
451: }
452: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
453: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
454: }
455: } catch (char *ex) {
456: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
457: }
458: }
459: PetscFunctionReturn(PETSC_SUCCESS);
460: }
462: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
463: {
464: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
465: PetscInt n = A->rmap->n;
466: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
467: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
468: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
469: const MatScalar *aa = a->a, *v;
470: PetscInt *AiUp, *AjUp;
471: PetscInt i, nz, nzUpper, offset;
473: PetscFunctionBegin;
474: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
475: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
476: try {
477: /* next, figure out the number of nonzeros in the upper triangular matrix. */
478: nzUpper = adiag[0] - adiag[n];
479: if (!upTriFactor) {
480: PetscScalar *AAUp;
482: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
484: /* Allocate Space for the upper triangular matrix */
485: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
486: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
488: /* Fill the upper triangular matrix */
489: AiUp[0] = (PetscInt)0;
490: AiUp[n] = nzUpper;
491: offset = nzUpper;
492: for (i = n - 1; i >= 0; i--) {
493: v = aa + adiag[i + 1] + 1;
494: vi = aj + adiag[i + 1] + 1;
496: /* number of elements NOT on the diagonal */
497: nz = adiag[i] - adiag[i + 1] - 1;
499: /* decrement the offset */
500: offset -= (nz + 1);
502: /* first, set the diagonal elements */
503: AjUp[offset] = (PetscInt)i;
504: AAUp[offset] = (MatScalar)1. / v[nz];
505: AiUp[i] = AiUp[i + 1] - (nz + 1);
507: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
508: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
509: }
511: /* allocate space for the triangular factor information */
512: PetscCall(PetscNew(&upTriFactor));
513: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
515: /* Create the matrix description */
516: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
517: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
518: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
519: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
520: #else
521: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
522: #endif
523: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
524: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
526: /* set the operation */
527: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
529: /* set the matrix */
530: upTriFactor->csrMat = new CsrMatrix;
531: upTriFactor->csrMat->num_rows = n;
532: upTriFactor->csrMat->num_cols = n;
533: upTriFactor->csrMat->num_entries = nzUpper;
535: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
536: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
538: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
539: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
541: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
542: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
544: /* Create the solve analysis information */
545: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
546: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
547: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
548: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
549: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
550: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
551: #endif
553: /* perform the solve analysis */
554: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
555: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
557: PetscCallCUDA(WaitForCUDA());
558: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
560: /* assign the pointer */
561: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
562: upTriFactor->AA_h = AAUp;
563: PetscCallCUDA(cudaFreeHost(AiUp));
564: PetscCallCUDA(cudaFreeHost(AjUp));
565: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
566: } else {
567: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
568: /* Fill the upper triangular matrix */
569: offset = nzUpper;
570: for (i = n - 1; i >= 0; i--) {
571: v = aa + adiag[i + 1] + 1;
573: /* number of elements NOT on the diagonal */
574: nz = adiag[i] - adiag[i + 1] - 1;
576: /* decrement the offset */
577: offset -= (nz + 1);
579: /* first, set the diagonal elements */
580: upTriFactor->AA_h[offset] = 1. / v[nz];
581: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
582: }
583: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
584: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
585: }
586: } catch (char *ex) {
587: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
588: }
589: }
590: PetscFunctionReturn(PETSC_SUCCESS);
591: }
592: #endif
594: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
595: {
596: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
597: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
598: IS isrow = a->row, isicol = a->icol;
599: PetscBool row_identity, col_identity;
600: PetscInt n = A->rmap->n;
602: PetscFunctionBegin;
603: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
604: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
605: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
606: #else
607: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
608: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
609: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
610: #endif
612: cusparseTriFactors->nnz = a->nz;
614: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
615: /* lower triangular indices */
616: PetscCall(ISIdentity(isrow, &row_identity));
617: if (!row_identity && !cusparseTriFactors->rpermIndices) {
618: const PetscInt *r;
620: PetscCall(ISGetIndices(isrow, &r));
621: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
622: cusparseTriFactors->rpermIndices->assign(r, r + n);
623: PetscCall(ISRestoreIndices(isrow, &r));
624: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
625: }
627: /* upper triangular indices */
628: PetscCall(ISIdentity(isicol, &col_identity));
629: if (!col_identity && !cusparseTriFactors->cpermIndices) {
630: const PetscInt *c;
632: PetscCall(ISGetIndices(isicol, &c));
633: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
634: cusparseTriFactors->cpermIndices->assign(c, c + n);
635: PetscCall(ISRestoreIndices(isicol, &c));
636: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
637: }
638: PetscFunctionReturn(PETSC_SUCCESS);
639: }
641: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
642: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
643: {
644: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
645: PetscInt m = A->rmap->n;
646: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
647: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
648: const MatScalar *Aa = a->a;
649: PetscInt *Mj, Mnz;
650: PetscScalar *Ma, *D;
652: PetscFunctionBegin;
653: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
654: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m = 0
655: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
656: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
657: Mnz = Ai[m]; // Unz (with the unit diagonal)
658: PetscCall(PetscMalloc1(Mnz, &Ma));
659: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
660: PetscCall(PetscMalloc1(m, &D)); // the diagonal
661: for (PetscInt i = 0; i < m; i++) {
662: PetscInt ulen = Ai[i + 1] - Ai[i];
663: Mj[Ai[i]] = i; // diagonal entry
664: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
665: }
666: // Copy M (U) from host to device
667: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
668: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
669: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
670: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
671: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
672: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
674: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
675: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
676: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
677: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
678: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
679: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
680: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
681: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
683: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
684: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
685: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
687: // Allocate work vectors in SpSv
688: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
689: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
691: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
692: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
694: // Query buffer sizes for SpSV and then allocate buffers
695: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
696: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
697: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
699: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
700: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
701: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
703: // Record for reuse
704: fs->csrVal_h = Ma;
705: fs->diag_h = D;
706: PetscCall(PetscFree(Mj));
707: }
708: // Copy the value
709: Ma = fs->csrVal_h;
710: D = fs->diag_h;
711: Mnz = Ai[m];
712: for (PetscInt i = 0; i < m; i++) {
713: D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
714: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
715: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
716: }
717: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
718: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
720: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
721: if (fs->updatedSpSVAnalysis) {
722: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
723: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724: } else
725: #endif
726: {
727: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
728: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
729: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
730: fs->updatedSpSVAnalysis = PETSC_TRUE;
731: }
732: }
733: PetscFunctionReturn(PETSC_SUCCESS);
734: }
736: // Solve Ut D U x = b
737: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
738: {
739: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
740: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
741: const PetscScalar *barray;
742: PetscScalar *xarray;
743: thrust::device_ptr<const PetscScalar> bGPU;
744: thrust::device_ptr<PetscScalar> xGPU;
745: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
746: PetscInt m = A->rmap->n;
748: PetscFunctionBegin;
749: PetscCall(PetscLogGpuTimeBegin());
750: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
751: PetscCall(VecCUDAGetArrayRead(b, &barray));
752: xGPU = thrust::device_pointer_cast(xarray);
753: bGPU = thrust::device_pointer_cast(barray);
755: // Reorder b with the row permutation if needed, and wrap the result in fs->X
756: if (fs->rpermIndices) {
757: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
758: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
759: } else {
760: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
761: }
763: // Solve Ut Y = X
764: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
765: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
767: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
768: // It is basically a vector element-wise multiplication, but cublas does not have it!
769: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
771: // Solve U X = Y
772: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
773: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
774: } else {
775: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
776: }
777: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
779: // Reorder X with the column permutation if needed, and put the result back to x
780: if (fs->cpermIndices) {
781: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
782: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
783: }
785: PetscCall(VecCUDARestoreArrayRead(b, &barray));
786: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
787: PetscCall(PetscLogGpuTimeEnd());
788: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
791: #else
792: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
793: {
794: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
795: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
796: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
797: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
798: PetscInt *AiUp, *AjUp;
799: PetscScalar *AAUp;
800: PetscScalar *AALo;
801: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
802: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
803: const PetscInt *ai = b->i, *aj = b->j, *vj;
804: const MatScalar *aa = b->a, *v;
806: PetscFunctionBegin;
807: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
808: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
809: try {
810: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
811: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
812: if (!upTriFactor && !loTriFactor) {
813: /* Allocate Space for the upper triangular matrix */
814: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
815: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
817: /* Fill the upper triangular matrix */
818: AiUp[0] = (PetscInt)0;
819: AiUp[n] = nzUpper;
820: offset = 0;
821: for (i = 0; i < n; i++) {
822: /* set the pointers */
823: v = aa + ai[i];
824: vj = aj + ai[i];
825: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
827: /* first, set the diagonal elements */
828: AjUp[offset] = (PetscInt)i;
829: AAUp[offset] = (MatScalar)1.0 / v[nz];
830: AiUp[i] = offset;
831: AALo[offset] = (MatScalar)1.0 / v[nz];
833: offset += 1;
834: if (nz > 0) {
835: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
836: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
837: for (j = offset; j < offset + nz; j++) {
838: AAUp[j] = -AAUp[j];
839: AALo[j] = AAUp[j] / v[nz];
840: }
841: offset += nz;
842: }
843: }
845: /* allocate space for the triangular factor information */
846: PetscCall(PetscNew(&upTriFactor));
847: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
849: /* Create the matrix description */
850: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
851: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
852: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
853: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
854: #else
855: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
856: #endif
857: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
858: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
860: /* set the matrix */
861: upTriFactor->csrMat = new CsrMatrix;
862: upTriFactor->csrMat->num_rows = A->rmap->n;
863: upTriFactor->csrMat->num_cols = A->cmap->n;
864: upTriFactor->csrMat->num_entries = a->nz;
866: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
867: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
869: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
870: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
872: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
873: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
875: /* set the operation */
876: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
878: /* Create the solve analysis information */
879: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
880: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
881: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
882: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
883: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
884: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
885: #endif
887: /* perform the solve analysis */
888: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
889: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
891: PetscCallCUDA(WaitForCUDA());
892: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
894: /* assign the pointer */
895: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
897: /* allocate space for the triangular factor information */
898: PetscCall(PetscNew(&loTriFactor));
899: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
901: /* Create the matrix description */
902: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
903: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
904: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
905: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
906: #else
907: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
908: #endif
909: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
910: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
912: /* set the operation */
913: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
915: /* set the matrix */
916: loTriFactor->csrMat = new CsrMatrix;
917: loTriFactor->csrMat->num_rows = A->rmap->n;
918: loTriFactor->csrMat->num_cols = A->cmap->n;
919: loTriFactor->csrMat->num_entries = a->nz;
921: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
922: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
924: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
925: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
927: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
928: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
930: /* Create the solve analysis information */
931: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
932: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
933: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
934: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
935: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
936: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
937: #endif
939: /* perform the solve analysis */
940: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
941: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
943: PetscCallCUDA(WaitForCUDA());
944: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
946: /* assign the pointer */
947: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
949: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
950: PetscCallCUDA(cudaFreeHost(AiUp));
951: PetscCallCUDA(cudaFreeHost(AjUp));
952: } else {
953: /* Fill the upper triangular matrix */
954: offset = 0;
955: for (i = 0; i < n; i++) {
956: /* set the pointers */
957: v = aa + ai[i];
958: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
960: /* first, set the diagonal elements */
961: AAUp[offset] = 1.0 / v[nz];
962: AALo[offset] = 1.0 / v[nz];
964: offset += 1;
965: if (nz > 0) {
966: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
967: for (j = offset; j < offset + nz; j++) {
968: AAUp[j] = -AAUp[j];
969: AALo[j] = AAUp[j] / v[nz];
970: }
971: offset += nz;
972: }
973: }
974: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing upper triangular factor");
975: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing lower triangular factor");
976: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
977: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
978: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
979: }
980: PetscCallCUDA(cudaFreeHost(AAUp));
981: PetscCallCUDA(cudaFreeHost(AALo));
982: } catch (char *ex) {
983: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
984: }
985: }
986: PetscFunctionReturn(PETSC_SUCCESS);
987: }
988: #endif
990: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
991: {
992: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
993: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
994: IS ip = a->row;
995: PetscBool perm_identity;
996: PetscInt n = A->rmap->n;
998: PetscFunctionBegin;
999: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
1001: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1002: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
1003: #else
1004: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1005: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1006: #endif
1007: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
1009: A->offloadmask = PETSC_OFFLOAD_BOTH;
1011: /* lower triangular indices */
1012: PetscCall(ISIdentity(ip, &perm_identity));
1013: if (!perm_identity) {
1014: IS iip;
1015: const PetscInt *irip, *rip;
1017: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1018: PetscCall(ISGetIndices(iip, &irip));
1019: PetscCall(ISGetIndices(ip, &rip));
1020: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1021: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1022: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1023: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1024: PetscCall(ISRestoreIndices(iip, &irip));
1025: PetscCall(ISDestroy(&iip));
1026: PetscCall(ISRestoreIndices(ip, &rip));
1027: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1028: }
1029: PetscFunctionReturn(PETSC_SUCCESS);
1030: }
1032: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1033: {
1034: PetscFunctionBegin;
1035: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1036: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1037: B->offloadmask = PETSC_OFFLOAD_CPU;
1039: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1040: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1041: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1042: #else
1043: /* determine which version of MatSolve needs to be used. */
1044: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1045: IS ip = b->row;
1046: PetscBool perm_identity;
1048: PetscCall(ISIdentity(ip, &perm_identity));
1049: if (perm_identity) {
1050: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1051: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1052: } else {
1053: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1054: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1055: }
1056: #endif
1057: B->ops->matsolve = NULL;
1058: B->ops->matsolvetranspose = NULL;
1060: /* get the triangular factors */
1061: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1062: PetscFunctionReturn(PETSC_SUCCESS);
1063: }
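/*
   Illustrative note (a sketch, not taken from this file): this path is typically reached with an incomplete Cholesky
   preconditioner on a CUSPARSE matrix, e.g. -mat_type seqaijcusparse -pc_type icc. The numeric factorization runs on
   the CPU via MatCholeskyFactorNumeric_SeqAIJ() above, and the factors are then copied to the GPU for the triangular solves.
*/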
1065: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1066: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1067: {
1068: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1069: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1070: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1071: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1072: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1073: cusparseIndexBase_t indexBase;
1074: cusparseMatrixType_t matrixType;
1075: cusparseFillMode_t fillMode;
1076: cusparseDiagType_t diagType;
1078: PetscFunctionBegin;
1079: /* allocate space for the transpose of the lower triangular factor */
1080: PetscCall(PetscNew(&loTriFactorT));
1081: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1083: /* set the matrix descriptors of the lower triangular factor */
1084: matrixType = cusparseGetMatType(loTriFactor->descr);
1085: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1086: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1087: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1089: /* Create the matrix description */
1090: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1091: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1092: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1093: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1094: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1096: /* set the operation */
1097: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1099: /* allocate GPU space for the CSC of the lower triangular factor*/
1100: loTriFactorT->csrMat = new CsrMatrix;
1101: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1102: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1103: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1104: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1105: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1106: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1108: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1109: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1110: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1111: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1112: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1113: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1114: #endif
1116: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1117: {
1118: // there is no clean way to have PetscCallCUSPARSE wrap this function, so capture the status and check it below
1119: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1120: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1121: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1122: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1123: #else
1124: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1125: #endif
1126: PetscCallCUSPARSE(stat);
1127: }
1129: PetscCallCUDA(WaitForCUDA());
1130: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1132: /* Create the solve analysis information */
1133: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1134: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1135: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1136: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1137: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1138: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1139: #endif
1141: /* perform the solve analysis */
1142: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1143: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1145: PetscCallCUDA(WaitForCUDA());
1146: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1148: /* assign the pointer */
1149: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1151: /*********************************************/
1152: /* Now the Transpose of the Upper Tri Factor */
1153: /*********************************************/
1155: /* allocate space for the transpose of the upper triangular factor */
1156: PetscCall(PetscNew(&upTriFactorT));
1157: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1159: /* set the matrix descriptors of the upper triangular factor */
1160: matrixType = cusparseGetMatType(upTriFactor->descr);
1161: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1162: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1163: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1165: /* Create the matrix description */
1166: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1167: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1168: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1169: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1170: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1172: /* set the operation */
1173: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1175: /* allocate GPU space for the CSC of the upper triangular factor */
1176: upTriFactorT->csrMat = new CsrMatrix;
1177: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1178: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1179: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1180: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1181: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1182: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1184: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1185: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1186: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1187: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1188: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1189: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1190: #endif
1192: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1193: {
1194: // there is no clean way to have PetscCallCUSPARSE wrap this function, so capture the status and check it below
1195: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1196: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1197: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1198: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1199: #else
1200: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1201: #endif
1202: PetscCallCUSPARSE(stat);
1203: }
1205: PetscCallCUDA(WaitForCUDA());
1206: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1208: /* Create the solve analysis information */
1209: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1210: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1211: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1212: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1213: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1214: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1215: #endif
1217: /* perform the solve analysis */
1218: /* TODO: this duplicated descriptor setup and solve analysis should be factored into a helper function */
1219: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1220: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1222: PetscCallCUDA(WaitForCUDA());
1223: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1225: /* assign the pointer */
1226: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227: PetscFunctionReturn(PETSC_SUCCESS);
1228: }
1229: #endif
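/* Helper functor used below in MatSeqAIJCUSPARSEFormExplicitTranspose(): after csr2csc is run on an
   array of scalar-encoded entry indices, this converts the transposed values back to PetscInt so they
   can be cached as the CSR->CSC permutation (csr2csc_i). */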
1231: struct PetscScalarToPetscInt {
1232: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1233: };
1235: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1236: {
1237: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1238: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1239: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1240: cusparseStatus_t stat;
1241: cusparseIndexBase_t indexBase;
1243: PetscFunctionBegin;
1244: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1245: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1246: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1247: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1248: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1249: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1250: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1251: PetscCall(PetscLogGpuTimeBegin());
1252: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1253: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1254: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1255: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1256: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1257: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1258: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1260: /* set alpha and beta */
1261: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1262: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1263: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1264: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1265: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1268: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1269: CsrMatrix *matrixT = new CsrMatrix;
1270: matstructT->mat = matrixT;
1271: matrixT->num_rows = A->cmap->n;
1272: matrixT->num_cols = A->rmap->n;
1273: matrixT->num_entries = a->nz;
1274: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1275: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1276: matrixT->values = new THRUSTARRAY(a->nz);
1278: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1279: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1281: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1282: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1283: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1284: indexBase, cusparse_scalartype);
1285: PetscCallCUSPARSE(stat);
1286: #else
1287: /* cusparse-11.x returns errors for zero-sized matrices until 11.2.1,
1288: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1290: It is unclear what a proper value for matstructT->matDescr should be with empty matrices, so we set
1291: it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1292: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we also set it accordingly.
1293: */
1294: if (matrixT->num_entries) {
1295: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1296: PetscCallCUSPARSE(stat);
1298: } else {
1299: matstructT->matDescr = NULL;
1300: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1301: }
1302: #endif
1303: #endif
1304: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1305: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1306: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1307: #else
1308: CsrMatrix *temp = new CsrMatrix;
1309: CsrMatrix *tempT = new CsrMatrix;
1310: /* First convert HYB to CSR */
1311: temp->num_rows = A->rmap->n;
1312: temp->num_cols = A->cmap->n;
1313: temp->num_entries = a->nz;
1314: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1315: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1316: temp->values = new THRUSTARRAY(a->nz);
1318: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1319: PetscCallCUSPARSE(stat);
1321: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1322: tempT->num_rows = A->rmap->n;
1323: tempT->num_cols = A->cmap->n;
1324: tempT->num_entries = a->nz;
1325: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1326: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1327: tempT->values = new THRUSTARRAY(a->nz);
1329: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1330: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1331: PetscCallCUSPARSE(stat);
1333: /* Last, convert CSC to HYB */
1334: cusparseHybMat_t hybMat;
1335: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1336: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1337: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1338: PetscCallCUSPARSE(stat);
1340: /* assign the pointer */
1341: matstructT->mat = hybMat;
1342: A->transupdated = PETSC_TRUE;
1343: /* delete temporaries */
1344: if (tempT) {
1345: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1346: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1347: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1348: delete (CsrMatrix *)tempT;
1349: }
1350: if (temp) {
1351: if (temp->values) delete (THRUSTARRAY *)temp->values;
1352: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1353: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1354: delete (CsrMatrix *)temp;
1355: }
1356: #endif
1357: }
1358: }
1359: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1360: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1361: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1362: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1363: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1364: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1365: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1366: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1367: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1368: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1369: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1370: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1371: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1372: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1373: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1374: }
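/* If the CSR->CSC permutation has not been computed yet, derive it by transposing an "index matrix":
   fill a scalar array with 0,1,...,nnz-1, run csr2csc on it, and convert the transposed values back to
   integers (csr2csc_i). Subsequent transpose updates can then reuse this permutation as a simple gather
   instead of calling csr2csc again. */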
1375: if (!cusparsestruct->csr2csc_i) {
1376: THRUSTARRAY csr2csc_a(matrix->num_entries);
1377: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1379: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1380: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1381: void *csr2cscBuffer;
1382: size_t csr2cscBufferSize;
1383: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1384: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1385: PetscCallCUSPARSE(stat);
1386: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1387: #endif
1389: if (matrix->num_entries) {
1390: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1391: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1392: All parameters were checked and appeared valid; it is not clear why cusparse complains.
1394: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1395: should be filled with indexBase, so we simply take that shortcut here.
1396: */
1397: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1398: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1399: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1400: PetscCallCUSPARSE(stat);
1401: #else
1402: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1403: PetscCallCUSPARSE(stat);
1404: #endif
1405: } else {
1406: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1407: }
1409: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1410: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1411: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1412: PetscCallCUDA(cudaFree(csr2cscBuffer));
1413: #endif
1414: }
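/* Gather the current CSR values through the cached csr2csc_i permutation to refresh the transpose values */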
1415: PetscCallThrust(
1416: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1417: }
1418: PetscCall(PetscLogGpuTimeEnd());
1419: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1420: /* the compressed row indices are not used for matTranspose */
1421: matstructT->cprowIndices = NULL;
1422: /* assign the pointer */
1423: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1424: A->transupdated = PETSC_TRUE;
1425: PetscFunctionReturn(PETSC_SUCCESS);
1426: }
1428: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1429: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1430: {
1431: const PetscScalar *barray;
1432: PetscScalar *xarray;
1433: thrust::device_ptr<const PetscScalar> bGPU;
1434: thrust::device_ptr<PetscScalar> xGPU;
1435: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1436: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1437: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1438: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1439: PetscInt m = A->rmap->n;
1441: PetscFunctionBegin;
1442: PetscCall(PetscLogGpuTimeBegin());
1443: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1444: PetscCall(VecCUDAGetArrayRead(b, &barray));
1445: xGPU = thrust::device_pointer_cast(xarray);
1446: bGPU = thrust::device_pointer_cast(barray);
1448: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1449: if (fs->rpermIndices) {
1450: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1451: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1452: } else {
1453: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1454: }
1456: // Solve L Y = X
1457: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1458: // Note that cusparseSpSV_solve() implicitly reuses the external buffer that was supplied to cusparseSpSV_analysis()!
1459: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1461: // Solve U X = Y
1462: if (fs->cpermIndices) {
1463: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1464: } else {
1465: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1466: }
1467: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1469: // Reorder X with the column permutation if needed, and put the result back to x
1470: if (fs->cpermIndices) {
1471: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1472: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1473: }
1474: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1475: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1476: PetscCall(PetscLogGpuTimeEnd());
1477: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1478: PetscFunctionReturn(PETSC_SUCCESS);
1479: }
1481: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1482: {
1483: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1484: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1485: const PetscScalar *barray;
1486: PetscScalar *xarray;
1487: thrust::device_ptr<const PetscScalar> bGPU;
1488: thrust::device_ptr<PetscScalar> xGPU;
1489: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1490: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1491: PetscInt m = A->rmap->n;
1493: PetscFunctionBegin;
1494: PetscCall(PetscLogGpuTimeBegin());
1495: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1496: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1497: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1498: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1500: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1501: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1502: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1503: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1504: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1505: }
1507: if (!fs->updatedTransposeSpSVAnalysis) {
1508: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1510: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1511: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1512: }
1514: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1515: PetscCall(VecCUDAGetArrayRead(b, &barray));
1516: xGPU = thrust::device_pointer_cast(xarray);
1517: bGPU = thrust::device_pointer_cast(barray);
1519: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1520: if (fs->rpermIndices) {
1521: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1522: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1523: } else {
1524: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1525: }
1527: // Solve Ut Y = X
1528: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1529: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1531: // Solve Lt X = Y
1532: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1533: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1534: } else {
1535: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1536: }
1537: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1539: // Reorder X with the column permutation if needed, and put the result back to x
1540: if (fs->cpermIndices) {
1541: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1542: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1543: }
1545: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1546: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1547: PetscCall(PetscLogGpuTimeEnd());
1548: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1549: PetscFunctionReturn(PETSC_SUCCESS);
1550: }
1551: #else
1552: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1553: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1554: {
1555: PetscInt n = xx->map->n;
1556: const PetscScalar *barray;
1557: PetscScalar *xarray;
1558: thrust::device_ptr<const PetscScalar> bGPU;
1559: thrust::device_ptr<PetscScalar> xGPU;
1560: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1561: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1562: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1563: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1565: PetscFunctionBegin;
1566: /* Analyze the matrix and create the transpose ... on the fly */
1567: if (!loTriFactorT && !upTriFactorT) {
1568: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1569: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1570: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1571: }
1573: /* Get the GPU pointers */
1574: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1575: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1576: xGPU = thrust::device_pointer_cast(xarray);
1577: bGPU = thrust::device_pointer_cast(barray);
1579: PetscCall(PetscLogGpuTimeBegin());
1580: /* First, reorder with the row permutation */
1581: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1583: /* First, solve U */
1584: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1585: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1587: /* Then, solve L */
1588: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1589: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1591: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1592: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1594: /* Copy the temporary to the full solution. */
1595: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1597: /* restore */
1598: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1599: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1600: PetscCall(PetscLogGpuTimeEnd());
1601: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1602: PetscFunctionReturn(PETSC_SUCCESS);
1603: }
1605: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1606: {
1607: const PetscScalar *barray;
1608: PetscScalar *xarray;
1609: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1610: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1611: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1612: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1614: PetscFunctionBegin;
1615: /* Analyze the matrix and create the transpose ... on the fly */
1616: if (!loTriFactorT && !upTriFactorT) {
1617: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1618: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1619: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1620: }
1622: /* Get the GPU pointers */
1623: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1624: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1626: PetscCall(PetscLogGpuTimeBegin());
1627: /* First, solve U */
1628: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1629: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1631: /* Then, solve L */
1632: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1633: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1635: /* restore */
1636: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1637: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1638: PetscCall(PetscLogGpuTimeEnd());
1639: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1640: PetscFunctionReturn(PETSC_SUCCESS);
1641: }
1643: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1644: {
1645: const PetscScalar *barray;
1646: PetscScalar *xarray;
1647: thrust::device_ptr<const PetscScalar> bGPU;
1648: thrust::device_ptr<PetscScalar> xGPU;
1649: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1650: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1651: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1652: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1654: PetscFunctionBegin;
1655: /* Get the GPU pointers */
1656: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1657: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1658: xGPU = thrust::device_pointer_cast(xarray);
1659: bGPU = thrust::device_pointer_cast(barray);
1661: PetscCall(PetscLogGpuTimeBegin());
1662: /* First, reorder with the row permutation */
1663: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1665: /* Next, solve L */
1666: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1667: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1669: /* Then, solve U */
1670: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1671: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1673: /* Last, reorder with the column permutation */
1674: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1676: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1677: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1678: PetscCall(PetscLogGpuTimeEnd());
1679: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1680: PetscFunctionReturn(PETSC_SUCCESS);
1681: }
1683: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1684: {
1685: const PetscScalar *barray;
1686: PetscScalar *xarray;
1687: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1688: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1689: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1690: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1692: PetscFunctionBegin;
1693: /* Get the GPU pointers */
1694: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1695: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1697: PetscCall(PetscLogGpuTimeBegin());
1698: /* First, solve L */
1699: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1700: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1702: /* Next, solve U */
1703: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1704: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1706: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1707: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1708: PetscCall(PetscLogGpuTimeEnd());
1709: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1710: PetscFunctionReturn(PETSC_SUCCESS);
1711: }
1712: #endif
1714: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1715: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1716: {
1717: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1718: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1719: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1720: CsrMatrix *Acsr;
1721: PetscInt m, nz;
1722: PetscBool flg;
1724: PetscFunctionBegin;
1725: if (PetscDefined(USE_DEBUG)) {
1726: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1727: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1728: }
1730: /* Copy A's value to fact */
1731: m = fact->rmap->n;
1732: nz = aij->nz;
1733: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1734: Acsr = (CsrMatrix *)Acusp->mat->mat;
1735: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1737: PetscCall(PetscLogGpuTimeBegin());
1738: /* Factorize fact inplace */
1739: if (m)
1740: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1741: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1742: if (PetscDefined(USE_DEBUG)) {
1743: int numerical_zero;
1744: cusparseStatus_t status;
1745: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1746: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1747: }
1749: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1750: if (fs->updatedSpSVAnalysis) {
1751: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1752: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1753: } else
1754: #endif
1755: {
1756: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1757: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1758: */
1759: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1761: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1763: fs->updatedSpSVAnalysis = PETSC_TRUE;
1764: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1765: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1766: }
1768: fact->offloadmask = PETSC_OFFLOAD_GPU;
1769: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1770: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1771: fact->ops->matsolve = NULL;
1772: fact->ops->matsolvetranspose = NULL;
1773: PetscCall(PetscLogGpuTimeEnd());
1774: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1775: PetscFunctionReturn(PETSC_SUCCESS);
1776: }
1778: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1779: {
1780: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1781: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1782: PetscInt m, nz;
1784: PetscFunctionBegin;
1785: if (PetscDefined(USE_DEBUG)) {
1786: PetscInt i;
1787: PetscBool flg, missing;
1789: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1790: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1791: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1792: PetscCall(MatMissingDiagonal(A, &missing, &i));
1793: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1794: }
1796: /* Free the old stale stuff */
1797: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1799: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
1800: even though they will not be used; they are allocated only to ease debugging.
1801: */
1802: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1804: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1805: fact->factortype = MAT_FACTOR_ILU;
1806: fact->info.factor_mallocs = 0;
1807: fact->info.fill_ratio_given = info->fill;
1808: fact->info.fill_ratio_needed = 1.0;
1810: aij->row = NULL;
1811: aij->col = NULL;
1813: /* ====================================================================== */
1814: /* Copy A's i, j to fact and also allocate the value array of fact. */
1815: /* We'll do in-place factorization on fact */
1816: /* ====================================================================== */
1817: const int *Ai, *Aj;
1819: m = fact->rmap->n;
1820: nz = aij->nz;
1822: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1823: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1824: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1825: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1826: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1827: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1829: /* ====================================================================== */
1830: /* Create descriptors for M, L, U */
1831: /* ====================================================================== */
1832: cusparseFillMode_t fillMode;
1833: cusparseDiagType_t diagType;
1835: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1836: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1837: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1839: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1840: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1841: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1842: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1843: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1844: */
1845: fillMode = CUSPARSE_FILL_MODE_LOWER;
1846: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1847: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1848: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1849: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1851: fillMode = CUSPARSE_FILL_MODE_UPPER;
1852: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1853: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1854: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1855: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1857: /* ========================================================================= */
1858: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1859: /* ========================================================================= */
1860: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1861: if (m)
1862: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1863: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1865: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1866: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1868: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1869: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1871: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1872: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1874: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1875: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1877: /* Experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1878: and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, show that
1879: spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can share storage with either of spsvBuffer_L/U.
1880: To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1881: */
1882: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1883: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1884: fs->spsvBuffer_L = fs->factBuffer_M;
1885: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1886: } else {
1887: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1888: fs->spsvBuffer_U = fs->factBuffer_M;
1889: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1890: }
1892: /* ========================================================================== */
1893: /* Perform analysis of ilu0 on M, SpSv on L and U */
1894: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1895: /* ========================================================================== */
1896: int structural_zero;
1897: cusparseStatus_t status;
1899: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1900: if (m)
1901: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1902: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1903: if (PetscDefined(USE_DEBUG)) {
1904: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1905: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1906: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1907: }
1909: /* Estimate FLOPs of the numeric factorization */
1910: {
1911: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1912: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1913: PetscLogDouble flops = 0.0;
1915: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1916: Ai = Aseq->i;
1917: Adiag = Aseq->diag;
1918: for (PetscInt i = 0; i < m; i++) {
1919: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros to the left of the diagonal in row i */
1920: nzRow = Ai[i + 1] - Ai[i];
1921: nzLeft = Adiag[i] - Ai[i];
1922: /* We eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination updates the
1923: nonzeros to the right of, and including, the eliminated entry, with one multiplication and one addition per update.
1924: */
1925: nzLeft = (nzRow - 1) / 2;
1926: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
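/* e.g., a row with nzRow = 5 gives nzLeft = (5 - 1) / 2 = 2 and contributes 2 * (2 * 5 - 2 + 1) = 18 flops under this estimate */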
1927: }
1928: }
1929: fs->numericFactFlops = flops;
1930: }
1931: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1932: PetscFunctionReturn(PETSC_SUCCESS);
1933: }
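/* Usage note: with an AIJCUSPARSE matrix, this ILU(0) path is typically reached via options such as
   -pc_type ilu -pc_factor_mat_solver_type cusparse (the exact options depend on the application and on
   the ordering/levels requested for the factorization). */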
1935: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1936: {
1937: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1938: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1939: const PetscScalar *barray;
1940: PetscScalar *xarray;
1942: PetscFunctionBegin;
1943: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1944: PetscCall(VecCUDAGetArrayRead(b, &barray));
1945: PetscCall(PetscLogGpuTimeBegin());
1947: /* Solve L*y = b */
1948: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1949: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1950: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1951: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1953: /* Solve Lt*x = y */
1954: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1955: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1956: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1958: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1959: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1961: PetscCall(PetscLogGpuTimeEnd());
1962: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1963: PetscFunctionReturn(PETSC_SUCCESS);
1964: }
1966: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1967: {
1968: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1969: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1970: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1971: CsrMatrix *Acsr;
1972: PetscInt m, nz;
1973: PetscBool flg;
1975: PetscFunctionBegin;
1976: if (PetscDefined(USE_DEBUG)) {
1977: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1978: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1979: }
1981: /* Copy A's value to fact */
1982: m = fact->rmap->n;
1983: nz = aij->nz;
1984: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1985: Acsr = (CsrMatrix *)Acusp->mat->mat;
1986: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1988: /* Factorize fact inplace */
1989: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1990: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1991: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1992: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1993: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1994: */
1995: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1996: if (PetscDefined(USE_DEBUG)) {
1997: int numerical_zero;
1998: cusparseStatus_t status;
1999: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2000: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2001: }
2003: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2004: if (fs->updatedSpSVAnalysis) {
2005: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2006: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2007: } else
2008: #endif
2009: {
2010: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
2012: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2013: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2014: */
2015: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2016: fs->updatedSpSVAnalysis = PETSC_TRUE;
2017: }
2019: fact->offloadmask = PETSC_OFFLOAD_GPU;
2020: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
2021: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
2022: fact->ops->matsolve = NULL;
2023: fact->ops->matsolvetranspose = NULL;
2024: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2025: PetscFunctionReturn(PETSC_SUCCESS);
2026: }
2028: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2029: {
2030: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2031: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2032: PetscInt m, nz;
2034: PetscFunctionBegin;
2035: if (PetscDefined(USE_DEBUG)) {
2036: PetscInt i;
2037: PetscBool flg, missing;
2039: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2040: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2041: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2042: PetscCall(MatMissingDiagonal(A, &missing, &i));
2043: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2044: }
2046: /* Free the old stale stuff */
2047: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2049: /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on host,
2050: but they will not be used; we allocate them only to make debugging easier.
2051: */
2052: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2054: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2055: fact->factortype = MAT_FACTOR_ICC;
2056: fact->info.factor_mallocs = 0;
2057: fact->info.fill_ratio_given = info->fill;
2058: fact->info.fill_ratio_needed = 1.0;
2060: aij->row = NULL;
2061: aij->col = NULL;
2063: /* ====================================================================== */
2064: /* Copy A's i, j to fact and also allocate the value array of fact. */
2065: /* We'll do in-place factorization on fact */
2066: /* ====================================================================== */
2067: const int *Ai, *Aj;
2069: m = fact->rmap->n;
2070: nz = aij->nz;
2072: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2073: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2074: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2075: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2076: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2077: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2079: /* ====================================================================== */
2080: /* Create mat descriptors for M, L */
2081: /* ====================================================================== */
2082: cusparseFillMode_t fillMode;
2083: cusparseDiagType_t diagType;
2085: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2086: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2087: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2089: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2090: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2091: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2092: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2093: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2094: */
2095: fillMode = CUSPARSE_FILL_MODE_LOWER;
2096: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2097: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2098: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2099: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2101: /* ========================================================================= */
2102: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2103: /* ========================================================================= */
2104: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2105: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2107: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2108: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2110: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2111: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2113: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2114: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2116: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2117: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2119: /* To save device memory, we let the factorization buffer share storage with the larger of the two triangular-solve buffers.
2120: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2121: */
2122: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2123: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2124: fs->spsvBuffer_L = fs->factBuffer_M;
2125: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2126: } else {
2127: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2128: fs->spsvBuffer_Lt = fs->factBuffer_M;
2129: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2130: }
2132: /* ========================================================================== */
2133: /* Perform analysis of ic0 on M */
2134: /* The lower triangular part of M has the same sparsity pattern as L */
2135: /* ========================================================================== */
2136: int structural_zero;
2137: cusparseStatus_t status;
2139: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2140: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2141: if (PetscDefined(USE_DEBUG)) {
2142: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2143: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2144: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2145: }
2147: /* Estimate FLOPs of the numeric factorization */
2148: {
2149: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2150: PetscInt *Ai, nzRow, nzLeft;
2151: PetscLogDouble flops = 0.0;
2153: Ai = Aseq->i;
2154: for (PetscInt i = 0; i < m; i++) {
2155: nzRow = Ai[i + 1] - Ai[i];
2156: if (nzRow > 1) {
2157: /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the nonzeros to the right of,
2158: and including, the eliminated one, and that each update incurs one multiplication and one addition.
2159: */
2160: nzLeft = (nzRow - 1) / 2;
2161: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
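/* For illustration: a row with nzRow = 5 nonzeros gives nzLeft = 2, adding 2*(2*5 - 2 + 1) = 18 flops */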
2162: }
2163: }
2164: fs->numericFactFlops = flops;
2165: }
2166: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2167: PetscFunctionReturn(PETSC_SUCCESS);
2168: }
2169: #endif
2171: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2172: {
2173: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2174: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2176: PetscFunctionBegin;
2177: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2178: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2179: B->offloadmask = PETSC_OFFLOAD_CPU;
2181: if (!cusparsestruct->use_cpu_solve) {
2182: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2183: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2184: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2185: #else
2186: /* determine which version of MatSolve needs to be used. */
2187: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2188: IS isrow = b->row, iscol = b->col;
2189: PetscBool row_identity, col_identity;
2191: PetscCall(ISIdentity(isrow, &row_identity));
2192: PetscCall(ISIdentity(iscol, &col_identity));
2193: if (row_identity && col_identity) {
2194: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2195: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2196: } else {
2197: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2198: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2199: }
2200: #endif
2201: }
2202: B->ops->matsolve = NULL;
2203: B->ops->matsolvetranspose = NULL;
2205: /* get the triangular factors */
2206: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2207: PetscFunctionReturn(PETSC_SUCCESS);
2208: }
2210: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2211: {
2212: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2214: PetscFunctionBegin;
2215: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2216: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2217: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2218: PetscFunctionReturn(PETSC_SUCCESS);
2219: }
2221: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2222: {
2223: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2225: PetscFunctionBegin;
2226: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2227: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2228: if (!info->factoronhost) {
2229: PetscCall(ISIdentity(isrow, &row_identity));
2230: PetscCall(ISIdentity(iscol, &col_identity));
2231: }
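/* Use the device ILU(0) fast path only for level-0 fill with natural (identity) orderings and when factorization on host was not requested; otherwise fall back to the host symbolic factorization below */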
2232: if (!info->levels && row_identity && col_identity) {
2233: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2234: } else
2235: #endif
2236: {
2237: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2238: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2239: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2240: }
2241: PetscFunctionReturn(PETSC_SUCCESS);
2242: }
2244: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2245: {
2246: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2248: PetscFunctionBegin;
2249: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2250: PetscBool perm_identity = PETSC_FALSE;
2251: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2252: if (!info->levels && perm_identity) {
2253: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2254: } else
2255: #endif
2256: {
2257: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2258: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2259: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2260: }
2261: PetscFunctionReturn(PETSC_SUCCESS);
2262: }
2264: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2265: {
2266: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2268: PetscFunctionBegin;
2269: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2270: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2271: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2272: PetscFunctionReturn(PETSC_SUCCESS);
2273: }
2275: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2276: {
2277: PetscFunctionBegin;
2278: *type = MATSOLVERCUSPARSE;
2279: PetscFunctionReturn(PETSC_SUCCESS);
2280: }
2282: /*MC
2283: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2284: of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2285: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2286: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2287: cuSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
2288: algorithms are not recommended. This class does NOT support direct solver operations.
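   As an illustrative sketch (not an exhaustive list of options), one would typically select this solver with
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   or by calling `PCFactorSetMatSolverType()` with `MATSOLVERCUSPARSE` in application code.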
2290: Level: beginner
2292: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2293: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2294: M*/
2296: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2297: {
2298: PetscInt n = A->rmap->n;
2300: PetscFunctionBegin;
2301: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2302: PetscCall(MatSetSizes(*B, n, n, n, n));
2303: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2304: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2306: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2307: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2308: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2309: if (!A->boundtocpu) {
2310: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2311: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2312: } else {
2313: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2314: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2315: }
2316: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2317: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2318: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2319: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2320: if (!A->boundtocpu) {
2321: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2322: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2323: } else {
2324: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2325: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2326: }
2327: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2328: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2329: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2331: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2332: (*B)->canuseordering = PETSC_TRUE;
2333: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2334: PetscFunctionReturn(PETSC_SUCCESS);
2335: }
2337: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2338: {
2339: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2340: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2341: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2342: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2343: #endif
2345: PetscFunctionBegin;
2346: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2347: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2348: if (A->factortype == MAT_FACTOR_NONE) {
2349: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2350: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2351: }
2352: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2353: else if (fs->csrVal) {
2354: /* We have a factorized matrix on device and are able to copy it to host */
2355: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2356: }
2357: #endif
2358: else
2359: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2360: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2361: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2362: A->offloadmask = PETSC_OFFLOAD_BOTH;
2363: }
2364: PetscFunctionReturn(PETSC_SUCCESS);
2365: }
2367: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2368: {
2369: PetscFunctionBegin;
2370: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2371: *array = ((Mat_SeqAIJ *)A->data)->a;
2372: PetscFunctionReturn(PETSC_SUCCESS);
2373: }
2375: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2376: {
2377: PetscFunctionBegin;
2378: A->offloadmask = PETSC_OFFLOAD_CPU;
2379: *array = NULL;
2380: PetscFunctionReturn(PETSC_SUCCESS);
2381: }
2383: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2384: {
2385: PetscFunctionBegin;
2386: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2387: *array = ((Mat_SeqAIJ *)A->data)->a;
2388: PetscFunctionReturn(PETSC_SUCCESS);
2389: }
2391: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2392: {
2393: PetscFunctionBegin;
2394: *array = NULL;
2395: PetscFunctionReturn(PETSC_SUCCESS);
2396: }
2398: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2399: {
2400: PetscFunctionBegin;
2401: *array = ((Mat_SeqAIJ *)A->data)->a;
2402: PetscFunctionReturn(PETSC_SUCCESS);
2403: }
2405: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2406: {
2407: PetscFunctionBegin;
2408: A->offloadmask = PETSC_OFFLOAD_CPU;
2409: *array = NULL;
2410: PetscFunctionReturn(PETSC_SUCCESS);
2411: }
2413: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2414: {
2415: Mat_SeqAIJCUSPARSE *cusp;
2416: CsrMatrix *matrix;
2418: PetscFunctionBegin;
2419: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2420: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2421: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2422: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2423: matrix = (CsrMatrix *)cusp->mat->mat;
2425: if (i) {
2426: #if !defined(PETSC_USE_64BIT_INDICES)
2427: *i = matrix->row_offsets->data().get();
2428: #else
2429: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2430: #endif
2431: }
2432: if (j) {
2433: #if !defined(PETSC_USE_64BIT_INDICES)
2434: *j = matrix->column_indices->data().get();
2435: #else
2436: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2437: #endif
2438: }
2439: if (a) *a = matrix->values->data().get();
2440: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2441: PetscFunctionReturn(PETSC_SUCCESS);
2442: }
2444: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2445: {
2446: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2447: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2448: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2449: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2450: cusparseStatus_t stat;
2451: PetscBool both = PETSC_TRUE;
2453: PetscFunctionBegin;
2454: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2455: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2456: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2457: CsrMatrix *matrix;
2458: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2460: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2461: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2462: matrix->values->assign(a->a, a->a + a->nz);
2463: PetscCallCUDA(WaitForCUDA());
2464: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2465: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2466: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2467: } else {
2468: PetscInt nnz;
2469: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2470: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2471: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2472: delete cusparsestruct->workVector;
2473: delete cusparsestruct->rowoffsets_gpu;
2474: cusparsestruct->workVector = NULL;
2475: cusparsestruct->rowoffsets_gpu = NULL;
2476: try {
2477: if (a->compressedrow.use) {
2478: m = a->compressedrow.nrows;
2479: ii = a->compressedrow.i;
2480: ridx = a->compressedrow.rindex;
2481: } else {
2482: m = A->rmap->n;
2483: ii = a->i;
2484: ridx = NULL;
2485: }
2486: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2487: if (!a->a) {
2488: nnz = ii[m];
2489: both = PETSC_FALSE;
2490: } else nnz = a->nz;
2491: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2493: /* create cusparse matrix */
2494: cusparsestruct->nrows = m;
2495: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2496: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2497: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2498: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2500: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2501: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2502: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2503: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2504: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2506: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2508: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2509: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2510: /* set the matrix */
2511: CsrMatrix *mat = new CsrMatrix;
2512: mat->num_rows = m;
2513: mat->num_cols = A->cmap->n;
2514: mat->num_entries = nnz;
2515: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2516: mat->row_offsets->assign(ii, ii + m + 1);
2518: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2519: mat->column_indices->assign(a->j, a->j + nnz);
2521: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2522: if (a->a) mat->values->assign(a->a, a->a + nnz);
2524: /* assign the pointer */
2525: matstruct->mat = mat;
2526: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2527: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2528: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2529: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2530: PetscCallCUSPARSE(stat);
2531: }
2532: #endif
2533: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2534: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2535: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2536: #else
2537: CsrMatrix *mat = new CsrMatrix;
2538: mat->num_rows = m;
2539: mat->num_cols = A->cmap->n;
2540: mat->num_entries = nnz;
2541: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2542: mat->row_offsets->assign(ii, ii + m + 1);
2544: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2545: mat->column_indices->assign(a->j, a->j + nnz);
2547: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2548: if (a->a) mat->values->assign(a->a, a->a + nnz);
2550: cusparseHybMat_t hybMat;
2551: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2552: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2553: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2554: PetscCallCUSPARSE(stat);
2555: /* assign the pointer */
2556: matstruct->mat = hybMat;
2558: if (mat) {
2559: if (mat->values) delete (THRUSTARRAY *)mat->values;
2560: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2561: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2562: delete (CsrMatrix *)mat;
2563: }
2564: #endif
2565: }
2567: /* assign the compressed row indices */
2568: if (a->compressedrow.use) {
2569: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2570: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2571: matstruct->cprowIndices->assign(ridx, ridx + m);
2572: tmp = m;
2573: } else {
2574: cusparsestruct->workVector = NULL;
2575: matstruct->cprowIndices = NULL;
2576: tmp = 0;
2577: }
2578: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2580: /* assign the pointer */
2581: cusparsestruct->mat = matstruct;
2582: } catch (char *ex) {
2583: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2584: }
2585: PetscCallCUDA(WaitForCUDA());
2586: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2587: cusparsestruct->nonzerostate = A->nonzerostate;
2588: }
2589: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2590: }
2591: PetscFunctionReturn(PETSC_SUCCESS);
2592: }
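/* Elementwise functors applied on the device, typically through thrust::for_each over zip iterators: the first adds the source entry into the destination entry, the second copies source to destination, and the third copies in the reverse direction */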
2594: struct VecCUDAPlusEquals {
2595: template <typename Tuple>
2596: __host__ __device__ void operator()(Tuple t)
2597: {
2598: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2599: }
2600: };
2602: struct VecCUDAEquals {
2603: template <typename Tuple>
2604: __host__ __device__ void operator()(Tuple t)
2605: {
2606: thrust::get<1>(t) = thrust::get<0>(t);
2607: }
2608: };
2610: struct VecCUDAEqualsReverse {
2611: template <typename Tuple>
2612: __host__ __device__ void operator()(Tuple t)
2613: {
2614: thrust::get<0>(t) = thrust::get<1>(t);
2615: }
2616: };
2618: struct MatMatCusparse {
2619: PetscBool cisdense;
2620: PetscScalar *Bt;
2621: Mat X;
2622: PetscBool reusesym; /* cuSPARSE does not have separate symbolic and numeric phases for sparse matmat operations */
2623: PetscLogDouble flops;
2624: CsrMatrix *Bcsr;
2626: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2627: cusparseSpMatDescr_t matSpBDescr;
2628: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2629: cusparseDnMatDescr_t matBDescr;
2630: cusparseDnMatDescr_t matCDescr;
2631: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2632: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2633: void *dBuffer4;
2634: void *dBuffer5;
2635: #endif
2636: size_t mmBufferSize;
2637: void *mmBuffer;
2638: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2639: cusparseSpGEMMDescr_t spgemmDesc;
2640: #endif
2641: };
2643: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2644: {
2645: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2647: PetscFunctionBegin;
2648: PetscCallCUDA(cudaFree(mmdata->Bt));
2649: delete mmdata->Bcsr;
2650: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2651: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2652: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2653: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2654: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2655: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2656: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2657: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2658: #endif
2659: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2660: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2661: #endif
2662: PetscCall(MatDestroy(&mmdata->X));
2663: PetscCall(PetscFree(data));
2664: PetscFunctionReturn(PETSC_SUCCESS);
2665: }
2667: #include <../src/mat/impls/dense/seq/dense.h>
2669: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2670: {
2671: Mat_Product *product = C->product;
2672: Mat A, B;
2673: PetscInt m, n, blda, clda;
2674: PetscBool flg, biscuda;
2675: Mat_SeqAIJCUSPARSE *cusp;
2676: cusparseStatus_t stat;
2677: cusparseOperation_t opA;
2678: const PetscScalar *barray;
2679: PetscScalar *carray;
2680: MatMatCusparse *mmdata;
2681: Mat_SeqAIJCUSPARSEMultStruct *mat;
2682: CsrMatrix *csrmat;
2684: PetscFunctionBegin;
2685: MatCheckProduct(C, 1);
2686: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2687: mmdata = (MatMatCusparse *)product->data;
2688: A = product->A;
2689: B = product->B;
2690: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2691: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2692: /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2693: Instead of silently accepting a wrong answer, we prefer to raise an error */
2694: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2695: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2696: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2697: switch (product->type) {
2698: case MATPRODUCT_AB:
2699: case MATPRODUCT_PtAP:
2700: mat = cusp->mat;
2701: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2702: m = A->rmap->n;
2703: n = B->cmap->n;
2704: break;
2705: case MATPRODUCT_AtB:
2706: if (!A->form_explicit_transpose) {
2707: mat = cusp->mat;
2708: opA = CUSPARSE_OPERATION_TRANSPOSE;
2709: } else {
2710: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2711: mat = cusp->matTranspose;
2712: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2713: }
2714: m = A->cmap->n;
2715: n = B->cmap->n;
2716: break;
2717: case MATPRODUCT_ABt:
2718: case MATPRODUCT_RARt:
2719: mat = cusp->mat;
2720: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2721: m = A->rmap->n;
2722: n = B->rmap->n;
2723: break;
2724: default:
2725: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2726: }
2727: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2728: csrmat = (CsrMatrix *)mat->mat;
2729: /* if the user passed a CPU matrix, copy the data to the GPU */
2730: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2731: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2732: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2734: PetscCall(MatDenseGetLDA(B, &blda));
2735: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2736: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2737: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2738: } else {
2739: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2740: PetscCall(MatDenseGetLDA(C, &clda));
2741: }
2743: PetscCall(PetscLogGpuTimeBegin());
2744: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2745: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2746: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2747: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2748: #else
2749: cusparseSpMatDescr_t &matADescr = mat->matDescr;
2750: #endif
2752: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2753: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2754: size_t mmBufferSize;
2755: if (mmdata->initialized && mmdata->Blda != blda) {
2756: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2757: mmdata->matBDescr = NULL;
2758: }
2759: if (!mmdata->matBDescr) {
2760: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2761: mmdata->Blda = blda;
2762: }
2764: if (mmdata->initialized && mmdata->Clda != clda) {
2765: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2766: mmdata->matCDescr = NULL;
2767: }
2768: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2769: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2770: mmdata->Clda = clda;
2771: }
2773: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2774: if (matADescr) {
2775: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found matADescr could not be reused; it could be a cusparse bug
2776: matADescr = NULL;
2777: }
2778: #endif
2780: if (!matADescr) {
2781: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2782: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2783: PetscCallCUSPARSE(stat);
2784: }
2786: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2788: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2789: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2790: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2791: mmdata->mmBufferSize = mmBufferSize;
2792: }
2794: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2795: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2796: #endif
2798: mmdata->initialized = PETSC_TRUE;
2799: } else {
2800: /* to be safe, always update pointers of the mats */
2801: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2802: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2803: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2804: }
2806: /* do cusparseSpMM, which supports transpose on B */
2807: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2808: #else
2809: PetscInt k;
2810: /* cusparseXcsrmm does not support transpose on B */
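/* Sketch of the workaround: cublasXgeam() computes C = alpha*op(A) + beta*op(B), so with alpha = 1, beta = 0 and both operations transposed it writes B^T into mmdata->Bt with leading dimension B->cmap->n */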
2811: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2812: cublasHandle_t cublasv2handle;
2813: cublasStatus_t cerr;
2815: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2816: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2817: PetscCallCUBLAS(cerr);
2818: blda = B->cmap->n;
2819: k = B->cmap->n;
2820: } else {
2821: k = B->rmap->n;
2822: }
2824: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2825: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2826: PetscCallCUSPARSE(stat);
2827: #endif
2828: PetscCall(PetscLogGpuTimeEnd());
2829: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2830: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2831: if (product->type == MATPRODUCT_RARt) {
2832: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2833: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2834: } else if (product->type == MATPRODUCT_PtAP) {
2835: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2836: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2837: } else {
2838: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2839: }
2840: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2841: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2842: PetscFunctionReturn(PETSC_SUCCESS);
2843: }
2845: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2846: {
2847: Mat_Product *product = C->product;
2848: Mat A, B;
2849: PetscInt m, n;
2850: PetscBool cisdense, flg;
2851: MatMatCusparse *mmdata;
2852: Mat_SeqAIJCUSPARSE *cusp;
2854: PetscFunctionBegin;
2855: MatCheckProduct(C, 1);
2856: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2857: A = product->A;
2858: B = product->B;
2859: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2860: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2861: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2862: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2863: switch (product->type) {
2864: case MATPRODUCT_AB:
2865: m = A->rmap->n;
2866: n = B->cmap->n;
2867: PetscCall(MatSetBlockSizesFromMats(C, A, B));
2868: break;
2869: case MATPRODUCT_AtB:
2870: m = A->cmap->n;
2871: n = B->cmap->n;
2872: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2873: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2874: break;
2875: case MATPRODUCT_ABt:
2876: m = A->rmap->n;
2877: n = B->rmap->n;
2878: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2879: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2880: break;
2881: case MATPRODUCT_PtAP:
2882: m = B->cmap->n;
2883: n = B->cmap->n;
2884: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2885: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2886: break;
2887: case MATPRODUCT_RARt:
2888: m = B->rmap->n;
2889: n = B->rmap->n;
2890: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2891: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2892: break;
2893: default:
2894: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2895: }
2896: PetscCall(MatSetSizes(C, m, n, m, n));
2897: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2898: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2899: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2901: /* product data */
2902: PetscCall(PetscNew(&mmdata));
2903: mmdata->cisdense = cisdense;
2904: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2905: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2906: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2907: #endif
2908: /* for these products we need intermediate storage */
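/* For MATPRODUCT_PtAP we first form X = A*P with SpMM and then C = P^T*X with a dense-dense product; for MATPRODUCT_RARt we form X = A*R^T and then C = R*X. See MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() above */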
2909: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2910: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2911: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2912: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2913: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2914: } else {
2915: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2916: }
2917: }
2918: C->product->data = mmdata;
2919: C->product->destroy = MatDestroy_MatMatCusparse;
2921: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2922: PetscFunctionReturn(PETSC_SUCCESS);
2923: }
2925: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2926: {
2927: Mat_Product *product = C->product;
2928: Mat A, B;
2929: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2930: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2931: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2932: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2933: PetscBool flg;
2934: cusparseStatus_t stat;
2935: MatProductType ptype;
2936: MatMatCusparse *mmdata;
2937: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2938: cusparseSpMatDescr_t BmatSpDescr;
2939: #endif
2940: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2942: PetscFunctionBegin;
2943: MatCheckProduct(C, 1);
2944: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2945: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2946: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2947: mmdata = (MatMatCusparse *)C->product->data;
2948: A = product->A;
2949: B = product->B;
2950: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2951: mmdata->reusesym = PETSC_FALSE;
2952: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2953: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2954: Cmat = Ccusp->mat;
2955: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2956: Ccsr = (CsrMatrix *)Cmat->mat;
2957: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2958: goto finalize;
2959: }
2960: if (!c->nz) goto finalize;
2961: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2962: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2963: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2964: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2965: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2966: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2967: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2968: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2969: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2970: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2971: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2973: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2974: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2976: ptype = product->type;
2977: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2978: ptype = MATPRODUCT_AB;
2979: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2980: }
2981: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2982: ptype = MATPRODUCT_AB;
2983: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2984: }
2985: switch (ptype) {
2986: case MATPRODUCT_AB:
2987: Amat = Acusp->mat;
2988: Bmat = Bcusp->mat;
2989: break;
2990: case MATPRODUCT_AtB:
2991: Amat = Acusp->matTranspose;
2992: Bmat = Bcusp->mat;
2993: break;
2994: case MATPRODUCT_ABt:
2995: Amat = Acusp->mat;
2996: Bmat = Bcusp->matTranspose;
2997: break;
2998: default:
2999: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3000: }
3001: Cmat = Ccusp->mat;
3002: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3003: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3004: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3005: Acsr = (CsrMatrix *)Amat->mat;
3006: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3007: Ccsr = (CsrMatrix *)Cmat->mat;
3008: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3009: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3010: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3011: PetscCall(PetscLogGpuTimeBegin());
3012: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3013: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3014: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3015: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3016: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3017: PetscCallCUSPARSE(stat);
3018: #else
3019: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3020: PetscCallCUSPARSE(stat);
3021: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3022: PetscCallCUSPARSE(stat);
3023: #endif
3024: #else
3025: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3026: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3027: PetscCallCUSPARSE(stat);
3028: #endif
3029: PetscCall(PetscLogGpuFlops(mmdata->flops));
3030: PetscCallCUDA(WaitForCUDA());
3031: PetscCall(PetscLogGpuTimeEnd());
3032: C->offloadmask = PETSC_OFFLOAD_GPU;
3033: finalize:
3034: /* shorter version of MatAssemblyEnd_SeqAIJ */
3035: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3036: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3037: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3038: c->reallocs = 0;
3039: C->info.mallocs += 0;
3040: C->info.nz_unneeded = 0;
3041: C->assembled = C->was_assembled = PETSC_TRUE;
3042: C->num_ass++;
3043: PetscFunctionReturn(PETSC_SUCCESS);
3044: }
3046: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3047: {
3048: Mat_Product *product = C->product;
3049: Mat A, B;
3050: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3051: Mat_SeqAIJ *a, *b, *c;
3052: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3053: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3054: PetscInt i, j, m, n, k;
3055: PetscBool flg;
3056: cusparseStatus_t stat;
3057: MatProductType ptype;
3058: MatMatCusparse *mmdata;
3059: PetscLogDouble flops;
3060: PetscBool biscompressed, ciscompressed;
3061: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3062: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3063: cusparseSpMatDescr_t BmatSpDescr;
3064: #else
3065: int cnz;
3066: #endif
3067: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3069: PetscFunctionBegin;
3070: MatCheckProduct(C, 1);
3071: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3072: A = product->A;
3073: B = product->B;
3074: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3075: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3076: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3077: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3078: a = (Mat_SeqAIJ *)A->data;
3079: b = (Mat_SeqAIJ *)B->data;
3080: /* product data */
3081: PetscCall(PetscNew(&mmdata));
3082: C->product->data = mmdata;
3083: C->product->destroy = MatDestroy_MatMatCusparse;
3085: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3086: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3087: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3088: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3089: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3090: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3092: ptype = product->type;
3093: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3094: ptype = MATPRODUCT_AB;
3095: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3096: }
3097: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3098: ptype = MATPRODUCT_AB;
3099: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3100: }
3101: biscompressed = PETSC_FALSE;
3102: ciscompressed = PETSC_FALSE;
3103: switch (ptype) {
3104: case MATPRODUCT_AB:
3105: m = A->rmap->n;
3106: n = B->cmap->n;
3107: k = A->cmap->n;
3108: Amat = Acusp->mat;
3109: Bmat = Bcusp->mat;
3110: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3111: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3112: break;
3113: case MATPRODUCT_AtB:
3114: m = A->cmap->n;
3115: n = B->cmap->n;
3116: k = A->rmap->n;
3117: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3118: Amat = Acusp->matTranspose;
3119: Bmat = Bcusp->mat;
3120: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3121: break;
3122: case MATPRODUCT_ABt:
3123: m = A->rmap->n;
3124: n = B->rmap->n;
3125: k = A->cmap->n;
3126: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3127: Amat = Acusp->mat;
3128: Bmat = Bcusp->matTranspose;
3129: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3130: break;
3131: default:
3132: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3133: }
3135: /* create cusparse matrix */
3136: PetscCall(MatSetSizes(C, m, n, m, n));
3137: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3138: c = (Mat_SeqAIJ *)C->data;
3139: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3140: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3141: Ccsr = new CsrMatrix;
3143: c->compressedrow.use = ciscompressed;
3144: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
3145: c->compressedrow.nrows = a->compressedrow.nrows;
3146: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3147: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3148: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3149: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3150: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3151: } else {
3152: c->compressedrow.nrows = 0;
3153: c->compressedrow.i = NULL;
3154: c->compressedrow.rindex = NULL;
3155: Ccusp->workVector = NULL;
3156: Cmat->cprowIndices = NULL;
3157: }
3158: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3159: Ccusp->mat = Cmat;
3160: Ccusp->mat->mat = Ccsr;
3161: Ccsr->num_rows = Ccusp->nrows;
3162: Ccsr->num_cols = n;
3163: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3164: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3165: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3166: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3167: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3168: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3169: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3170: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3171: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3174: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3175: c->nz = 0;
3176: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3177: Ccsr->values = new THRUSTARRAY(c->nz);
3178: goto finalizesym;
3179: }
3181: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3182: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3183: Acsr = (CsrMatrix *)Amat->mat;
3184: if (!biscompressed) {
3185: Bcsr = (CsrMatrix *)Bmat->mat;
3186: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3187: BmatSpDescr = Bmat->matDescr;
3188: #endif
3189: } else { /* we need to use row offsets for the full matrix */
3190: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3191: Bcsr = new CsrMatrix;
3192: Bcsr->num_rows = B->rmap->n;
3193: Bcsr->num_cols = cBcsr->num_cols;
3194: Bcsr->num_entries = cBcsr->num_entries;
3195: Bcsr->column_indices = cBcsr->column_indices;
3196: Bcsr->values = cBcsr->values;
3197: if (!Bcusp->rowoffsets_gpu) {
3198: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3199: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3200: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3201: }
3202: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3203: mmdata->Bcsr = Bcsr;
3204: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3205: if (Bcsr->num_rows && Bcsr->num_cols) {
3206: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3207: PetscCallCUSPARSE(stat);
3208: }
3209: BmatSpDescr = mmdata->matSpBDescr;
3210: #endif
3211: }
3212: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3213: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3214: /* precompute flops count */
3215: if (ptype == MATPRODUCT_AB) {
3216: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3217: const PetscInt st = a->i[i];
3218: const PetscInt en = a->i[i + 1];
3219: for (j = st; j < en; j++) {
3220: const PetscInt brow = a->j[j];
3221: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3222: }
3223: }
3224: } else if (ptype == MATPRODUCT_AtB) {
3225: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3226: const PetscInt anzi = a->i[i + 1] - a->i[i];
3227: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3228: flops += (2. * anzi) * bnzi;
3229: }
3230: } else { /* TODO */
3231: flops = 0.;
3232: }
3234: mmdata->flops = flops;
3235: PetscCall(PetscLogGpuTimeBegin());
3237: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3238: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3239: // cuda-12.2 requires non-null csrRowOffsets
3240: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3241: PetscCallCUSPARSE(stat);
3242: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3243: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3244: {
3245: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3246: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3247: */
3248: void *dBuffer1 = NULL;
3249: void *dBuffer2 = NULL;
3250: void *dBuffer3 = NULL;
3251: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3252: size_t bufferSize1 = 0;
3253: size_t bufferSize2 = 0;
3254: size_t bufferSize3 = 0;
3255: size_t bufferSize4 = 0;
3256: size_t bufferSize5 = 0;
3258: /* query how many bytes (bufferSize1) of external memory are needed */
3259: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3260: PetscCallCUSPARSE(stat);
3261: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3262: /* inspect the matrices A and B to understand the memory requirement for the next step */
3263: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3264: PetscCallCUSPARSE(stat);
3266: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3267: PetscCallCUSPARSE(stat);
3268: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3269: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3270: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3271: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3272: PetscCallCUSPARSE(stat);
3273: PetscCallCUDA(cudaFree(dBuffer1));
3274: PetscCallCUDA(cudaFree(dBuffer2));
3276: /* get matrix C non-zero entries C_nnz1 */
3277: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3278: c->nz = (PetscInt)C_nnz1;
3279: /* allocate matrix C */
3280: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3281: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3282: Ccsr->values = new THRUSTARRAY(c->nz);
3283: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3284: /* update matC with the new pointers */
3285: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3286: PetscCallCUSPARSE(stat);
3288: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3289: PetscCallCUSPARSE(stat);
3290: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3291: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3292: PetscCallCUSPARSE(stat);
3293: PetscCallCUDA(cudaFree(dBuffer3));
3294: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3295: PetscCallCUSPARSE(stat);
3296: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3297: }
3298: #else
3299: size_t bufSize2;
3300: /* query how many bytes of external memory are needed */
3301: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3302: PetscCallCUSPARSE(stat);
3303: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3304: /* inspect the matrices A and B to understand the memory requirement for the next step */
3305: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3306: PetscCallCUSPARSE(stat);
3307: /* ask again how many bytes of external memory are needed */
3308: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3309: PetscCallCUSPARSE(stat);
3310: /* Neither the CUSPARSE documentation nor the API is clear here:
3311: we need both buffers to perform the operations properly!
3312: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3313: it only appears in the workEstimation calls, yet it seems to be needed in compute, so probably the address
3314: is stored in the descriptor! What a messy API... */
3315: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3316: /* compute the intermediate product of A * B */
3317: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3318: PetscCallCUSPARSE(stat);
3319: /* get matrix C non-zero entries C_nnz1 */
3320: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3321: c->nz = (PetscInt)C_nnz1;
3322: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3323: mmdata->mmBufferSize / 1024));
3324: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3325: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3326: Ccsr->values = new THRUSTARRAY(c->nz);
3327: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3328: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3329: PetscCallCUSPARSE(stat);
3330: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3331: PetscCallCUSPARSE(stat);
3332: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3333: #else
3334: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3335: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3336: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3337: PetscCallCUSPARSE(stat);
3338: c->nz = cnz;
3339: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3340: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3341: Ccsr->values = new THRUSTARRAY(c->nz);
3342: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3344: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3345: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3346: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
3347: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3348: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3349: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3350: PetscCallCUSPARSE(stat);
3351: #endif
3352: PetscCall(PetscLogGpuFlops(mmdata->flops));
3353: PetscCall(PetscLogGpuTimeEnd());
3354: finalizesym:
3355: c->free_a = PETSC_TRUE;
3356: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3357: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3358: c->free_ij = PETSC_TRUE;
3359: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3360: PetscInt *d_i = c->i;
3361: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3362: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3363: ii = *Ccsr->row_offsets;
3364: jj = *Ccsr->column_indices;
3365: if (ciscompressed) d_i = c->compressedrow.i;
3366: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368: } else {
3369: PetscInt *d_i = c->i;
3370: if (ciscompressed) d_i = c->compressedrow.i;
3371: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373: }
3374: if (ciscompressed) { /* need to expand host row offsets */
3375: PetscInt r = 0;
3376: c->i[0] = 0;
3377: for (k = 0; k < c->compressedrow.nrows; k++) {
3378: const PetscInt next = c->compressedrow.rindex[k];
3379: const PetscInt old = c->compressedrow.i[k];
3380: for (; r < next; r++) c->i[r + 1] = old;
3381: }
3382: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3383: }
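  /* Illustrative note (not part of the original source): a small worked example of the expansion above.
     With m = 4, compressedrow.nrows = 2, compressedrow.rindex = {1,3} and compressedrow.i = {0,2,5},
     the loops produce the full row offsets c->i = {0,0,2,2,5}: row 0 is empty, row 1 holds entries [0,2),
     row 2 is empty, and row 3 holds entries [2,5). */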
3384: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3385: PetscCall(PetscMalloc1(m, &c->ilen));
3386: PetscCall(PetscMalloc1(m, &c->imax));
3387: c->maxnz = c->nz;
3388: c->nonzerorowcnt = 0;
3389: c->rmax = 0;
3390: for (k = 0; k < m; k++) {
3391: const PetscInt nn = c->i[k + 1] - c->i[k];
3392: c->ilen[k] = c->imax[k] = nn;
3393: c->nonzerorowcnt += (PetscInt)!!nn;
3394: c->rmax = PetscMax(c->rmax, nn);
3395: }
3396: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3397: PetscCall(PetscMalloc1(c->nz, &c->a));
3398: Ccsr->num_entries = c->nz;
3400: C->nonzerostate++;
3401: PetscCall(PetscLayoutSetUp(C->rmap));
3402: PetscCall(PetscLayoutSetUp(C->cmap));
3403: Ccusp->nonzerostate = C->nonzerostate;
3404: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3405: C->preallocated = PETSC_TRUE;
3406: C->assembled = PETSC_FALSE;
3407: C->was_assembled = PETSC_FALSE;
3408: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3409: mmdata->reusesym = PETSC_TRUE;
3410: C->offloadmask = PETSC_OFFLOAD_GPU;
3411: }
3412: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3413: PetscFunctionReturn(PETSC_SUCCESS);
3414: }
3416: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3418: /* handles sparse or dense B */
3419: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3420: {
3421: Mat_Product *product = mat->product;
3422: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3424: PetscFunctionBegin;
3425: MatCheckProduct(mat, 1);
3426: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3427: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3428: if (product->type == MATPRODUCT_ABC) {
3429: Ciscusp = PETSC_FALSE;
3430: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3431: }
3432: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3433: PetscBool usecpu = PETSC_FALSE;
3434: switch (product->type) {
3435: case MATPRODUCT_AB:
3436: if (product->api_user) {
3437: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3438: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3439: PetscOptionsEnd();
3440: } else {
3441: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3442: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3443: PetscOptionsEnd();
3444: }
3445: break;
3446: case MATPRODUCT_AtB:
3447: if (product->api_user) {
3448: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3449: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3450: PetscOptionsEnd();
3451: } else {
3452: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3453: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3454: PetscOptionsEnd();
3455: }
3456: break;
3457: case MATPRODUCT_PtAP:
3458: if (product->api_user) {
3459: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3460: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3461: PetscOptionsEnd();
3462: } else {
3463: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3464: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3465: PetscOptionsEnd();
3466: }
3467: break;
3468: case MATPRODUCT_RARt:
3469: if (product->api_user) {
3470: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3471: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3472: PetscOptionsEnd();
3473: } else {
3474: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3475: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3476: PetscOptionsEnd();
3477: }
3478: break;
3479: case MATPRODUCT_ABC:
3480: if (product->api_user) {
3481: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3482: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3483: PetscOptionsEnd();
3484: } else {
3485: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3486: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3487: PetscOptionsEnd();
3488: }
3489: break;
3490: default:
3491: break;
3492: }
3493: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3494: }
3495: /* dispatch */
3496: if (isdense) {
3497: switch (product->type) {
3498: case MATPRODUCT_AB:
3499: case MATPRODUCT_AtB:
3500: case MATPRODUCT_ABt:
3501: case MATPRODUCT_PtAP:
3502: case MATPRODUCT_RARt:
3503: if (product->A->boundtocpu) {
3504: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3505: } else {
3506: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3507: }
3508: break;
3509: case MATPRODUCT_ABC:
3510: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3511: break;
3512: default:
3513: break;
3514: }
3515: } else if (Biscusp && Ciscusp) {
3516: switch (product->type) {
3517: case MATPRODUCT_AB:
3518: case MATPRODUCT_AtB:
3519: case MATPRODUCT_ABt:
3520: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3521: break;
3522: case MATPRODUCT_PtAP:
3523: case MATPRODUCT_RARt:
3524: case MATPRODUCT_ABC:
3525: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3526: break;
3527: default:
3528: break;
3529: }
3530: } else { /* fallback for AIJ */
3531: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3532: }
3533: PetscFunctionReturn(PETSC_SUCCESS);
3534: }
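/* Illustrative usage sketch (not part of the original source), assuming a CUDA-enabled PETSc build:
   with A and B of type MATSEQAIJCUSPARSE, MatMatMult() is dispatched to the GPU symbolic/numeric
   routines selected above, unless the user forces the CPU path with -matmatmult_backend_cpu.

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic + numeric product on the GPU
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // reuses the symbolic phase, numeric only
     PetscCall(MatDestroy(&C));
*/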
3536: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3537: {
3538: PetscFunctionBegin;
3539: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3540: PetscFunctionReturn(PETSC_SUCCESS);
3541: }
3543: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3544: {
3545: PetscFunctionBegin;
3546: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3547: PetscFunctionReturn(PETSC_SUCCESS);
3548: }
3550: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3551: {
3552: PetscFunctionBegin;
3553: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3554: PetscFunctionReturn(PETSC_SUCCESS);
3555: }
3557: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3558: {
3559: PetscFunctionBegin;
3560: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3561: PetscFunctionReturn(PETSC_SUCCESS);
3562: }
3564: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3565: {
3566: PetscFunctionBegin;
3567: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3568: PetscFunctionReturn(PETSC_SUCCESS);
3569: }
3571: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3572: {
3573: int i = blockIdx.x * blockDim.x + threadIdx.x;
3574: if (i < n) y[idx[i]] += x[i];
3575: }
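/* Launched below in MatMultAddKernel_SeqAIJCUSPARSE with (n + 255)/256 blocks of 256 threads,
   i.e. one thread per compressed row, to add the work-vector entries into the full-length result. */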
3577: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3578: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3579: {
3580: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3581: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3582: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3583: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3584: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3585: PetscBool compressed;
3586: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3587: PetscInt nx, ny;
3588: #endif
3590: PetscFunctionBegin;
3591: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian without transpose is not supported");
3592: if (!a->nz) {
3593: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3594: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3595: PetscFunctionReturn(PETSC_SUCCESS);
3596: }
3597: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3598: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3599: if (!trans) {
3600: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3601: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3602: } else {
3603: if (herm || !A->form_explicit_transpose) {
3604: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3605: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3606: } else {
3607: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3608: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3609: }
3610: }
3611: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3612: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3614: try {
3615: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3616: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3617: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3619: PetscCall(PetscLogGpuTimeBegin());
3620: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3621: /* z = A x + beta y.
3622: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3623: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3624: */
3625: xptr = xarray;
3626: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3627: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3628: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3629: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3630: allocated to accommodate different uses. So we get the length info directly from mat.
3631: */
3632: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3633: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3634: nx = mat->num_cols; // since y = Ax
3635: ny = mat->num_rows;
3636: }
3637: #endif
3638: } else {
3639: /* z = A^T x + beta y
3640: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3641: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3642: */
3643: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3644: dptr = zarray;
3645: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3646: if (compressed) { /* Scatter x to work vector */
3647: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3649: thrust::for_each(
3650: #if PetscDefined(HAVE_THRUST_ASYNC)
3651: thrust::cuda::par.on(PetscDefaultCudaStream),
3652: #endif
3653: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3654: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3655: }
3656: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3657: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3658: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3659: nx = mat->num_rows; // since y = A^T x
3660: ny = mat->num_cols;
3661: }
3662: #endif
3663: }
3665: /* csr_spmv does y = alpha op(A) x + beta y */
3666: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3667: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3668: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3669: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3670: #else
3671: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3672: #endif
3674: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3675: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3676: if (!matDescr) {
3677: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3678: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3679: }
3680: #endif
3682: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3683: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3684: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3685: PetscCallCUSPARSE(
3686: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3687: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3688: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3689: PetscCallCUSPARSE(
3690: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3691: #endif
3692: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3693: } else {
3694: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3695: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3696: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3697: }
3699: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3700: #else
3701: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3702: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3703: #endif
3704: } else {
3705: if (cusparsestruct->nrows) {
3706: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3707: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3708: #else
3709: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3710: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3711: #endif
3712: }
3713: }
3714: PetscCall(PetscLogGpuTimeEnd());
3716: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3717: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3718: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3719: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3720: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3721: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3722: }
3723: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3724: PetscCall(VecSeq_CUDA::Set(zz, 0));
3725: }
3727: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3728: if (compressed) {
3729: PetscCall(PetscLogGpuTimeBegin());
3730: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3731: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3732: PetscCall(PetscLogGpuTimeEnd());
3733: }
3734: } else {
3735: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3736: }
3737: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3738: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3739: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3740: } catch (char *ex) {
3741: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3742: }
3743: if (yy) {
3744: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3745: } else {
3746: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3747: }
3748: PetscFunctionReturn(PETSC_SUCCESS);
3749: }
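/* Illustrative note (not part of the original source) on the compressed-row path above for z = A*x + y:
   the SpMV writes one value per nonzero row into the work vector w, and the ScatterAdd kernel folds it
   back into the full-length result, conceptually

     z = y (or 0 when there is no y);
     for (i = 0; i < cprowIndices->size(); i++) z[cprowIndices[i]] += w[i];
*/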
3751: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3752: {
3753: PetscFunctionBegin;
3754: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3755: PetscFunctionReturn(PETSC_SUCCESS);
3756: }
3758: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3759: {
3760: PetscFunctionBegin;
3761: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3762: PetscFunctionReturn(PETSC_SUCCESS);
3763: }
3765: /*@
3766: MatCreateSeqAIJCUSPARSE - Creates a sequential sparse matrix in `MATSEQAIJCUSPARSE` (compressed row) format
3767: (the GPU-enabled version of the default sequential PETSc format).
3769: Collective
3771: Input Parameters:
3772: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3773: . m - number of rows
3774: . n - number of columns
3775: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3776: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3778: Output Parameter:
3779: . A - the matrix
3781: Level: intermediate
3783: Notes:
3784: This matrix will ultimately be pushed down to NVIDIA GPUs and will use the CuSPARSE library for
3785: calculations. For good matrix assembly performance the user should preallocate the matrix
3786: storage by setting the parameter `nz` (or the array `nnz`).
3788: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3789: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3790: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3792: The AIJ format, also called
3793: compressed row storage, is fully compatible with standard Fortran
3794: storage. That is, the stored row and column indices can begin at
3795: either one (as in Fortran) or zero.
3797: Specify the preallocated storage with either nz or nnz (not both).
3798: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3799: allocation.
3801: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3802: @*/
3803: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3804: {
3805: PetscFunctionBegin;
3806: PetscCall(MatCreate(comm, A));
3807: PetscCall(MatSetSizes(*A, m, n, m, n));
3808: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3809: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3810: PetscFunctionReturn(PETSC_SUCCESS);
3811: }
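/* Example usage (illustrative sketch, not part of the original source), assuming a CUDA-enabled PETSc build:
   build a small tridiagonal matrix with at most 3 nonzeros per row; values are set on the host and copied
   to the GPU lazily, on the first operation that needs them (boundary rows omitted for brevity).

     Mat         A;
     PetscInt    i, n = 10, col[3];
     PetscScalar v[3] = {-1.0, 2.0, -1.0};

     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, 3, NULL, &A));
     for (i = 1; i < n - 1; i++) {
       col[0] = i - 1; col[1] = i; col[2] = i + 1;
       PetscCall(MatSetValues(A, 1, &i, 3, col, v, INSERT_VALUES));
     }
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatDestroy(&A));
*/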
3813: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3814: {
3815: PetscFunctionBegin;
3816: if (A->factortype == MAT_FACTOR_NONE) {
3817: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3818: } else {
3819: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3820: }
3821: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3822: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3823: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3824: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3825: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3826: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3827: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3828: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3829: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3830: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3831: PetscCall(MatDestroy_SeqAIJ(A));
3832: PetscFunctionReturn(PETSC_SUCCESS);
3833: }
3835: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3836: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3837: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3838: {
3839: PetscFunctionBegin;
3840: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3841: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3842: PetscFunctionReturn(PETSC_SUCCESS);
3843: }
3845: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3846: {
3847: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3848: Mat_SeqAIJCUSPARSE *cy;
3849: Mat_SeqAIJCUSPARSE *cx;
3850: PetscScalar *ay;
3851: const PetscScalar *ax;
3852: CsrMatrix *csry, *csrx;
3854: PetscFunctionBegin;
3855: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3856: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3857: if (X->ops->axpy != Y->ops->axpy) {
3858: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3859: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3860: PetscFunctionReturn(PETSC_SUCCESS);
3861: }
3862: /* if we are here, it means both matrices are bound to GPU */
3863: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3864: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3865: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3866: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3867: csry = (CsrMatrix *)cy->mat->mat;
3868: csrx = (CsrMatrix *)cx->mat->mat;
3869: /* see if we can turn this into a cublas axpy */
3870: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3871: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3872: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3873: if (eq) str = SAME_NONZERO_PATTERN;
3874: }
3875: /* spgeam is buggy with one column */
3876: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3878: if (str == SUBSET_NONZERO_PATTERN) {
3879: PetscScalar b = 1.0;
3880: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3881: size_t bufferSize;
3882: void *buffer;
3883: #endif
3885: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3886: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3887: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3888: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3889: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3890: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3891: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3892: PetscCall(PetscLogGpuTimeBegin());
3893: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3894: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3895: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3896: PetscCall(PetscLogGpuTimeEnd());
3897: PetscCallCUDA(cudaFree(buffer));
3898: #else
3899: PetscCall(PetscLogGpuTimeBegin());
3900: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3901: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3902: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3903: PetscCall(PetscLogGpuTimeEnd());
3904: #endif
3905: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3906: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3907: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3908: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3909: } else if (str == SAME_NONZERO_PATTERN) {
3910: cublasHandle_t cublasv2handle;
3911: PetscBLASInt one = 1, bnz = 1;
3913: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3914: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3915: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3916: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3917: PetscCall(PetscLogGpuTimeBegin());
3918: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3919: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3920: PetscCall(PetscLogGpuTimeEnd());
3921: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3922: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3923: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3924: } else {
3925: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3926: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3927: }
3928: PetscFunctionReturn(PETSC_SUCCESS);
3929: }
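/* Illustrative usage sketch (not part of the original source): when X and Y are both MATSEQAIJCUSPARSE and
   structurally identical, the routine above reduces Y = Y + a*X to a single cuBLAS axpy on the value arrays.

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));      // fast path: cublasXaxpy on the GPU values
     PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN)); // falls back to the CPU MatAXPY_SeqAIJ
*/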
3931: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3932: {
3933: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3934: PetscScalar *ay;
3935: cublasHandle_t cublasv2handle;
3936: PetscBLASInt one = 1, bnz = 1;
3938: PetscFunctionBegin;
3939: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3940: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3941: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3942: PetscCall(PetscLogGpuTimeBegin());
3943: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3944: PetscCall(PetscLogGpuFlops(bnz));
3945: PetscCall(PetscLogGpuTimeEnd());
3946: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3947: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3948: PetscFunctionReturn(PETSC_SUCCESS);
3949: }
3951: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3952: {
3953: PetscBool both = PETSC_FALSE;
3954: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3956: PetscFunctionBegin;
3957: if (A->factortype == MAT_FACTOR_NONE) {
3958: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3959: if (spptr->mat) {
3960: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3961: if (matrix->values) {
3962: both = PETSC_TRUE;
3963: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3964: }
3965: }
3966: if (spptr->matTranspose) {
3967: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3968: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3969: }
3970: }
3971: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3972: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3973: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3974: else A->offloadmask = PETSC_OFFLOAD_CPU;
3975: PetscFunctionReturn(PETSC_SUCCESS);
3976: }
3978: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
3979: {
3980: PetscFunctionBegin;
3981: *m = PETSC_MEMTYPE_CUDA;
3982: PetscFunctionReturn(PETSC_SUCCESS);
3983: }
3985: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3986: {
3987: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3989: PetscFunctionBegin;
3990: if (A->factortype != MAT_FACTOR_NONE) {
3991: A->boundtocpu = flg;
3992: PetscFunctionReturn(PETSC_SUCCESS);
3993: }
3994: if (flg) {
3995: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3997: A->ops->scale = MatScale_SeqAIJ;
3998: A->ops->axpy = MatAXPY_SeqAIJ;
3999: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
4000: A->ops->mult = MatMult_SeqAIJ;
4001: A->ops->multadd = MatMultAdd_SeqAIJ;
4002: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
4003: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
4004: A->ops->multhermitiantranspose = NULL;
4005: A->ops->multhermitiantransposeadd = NULL;
4006: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
4007: A->ops->getcurrentmemtype = NULL;
4008: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4009: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4010: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4011: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4012: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4013: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4014: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4015: } else {
4016: A->ops->scale = MatScale_SeqAIJCUSPARSE;
4017: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
4018: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
4019: A->ops->mult = MatMult_SeqAIJCUSPARSE;
4020: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
4021: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
4022: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
4023: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4024: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4025: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
4026: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4027: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4028: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4029: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4030: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4031: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4032: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4033: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4035: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4036: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4037: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4038: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4039: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4040: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4041: }
4042: A->boundtocpu = flg;
4043: if (flg && a->inode.size_csr) {
4044: a->inode.use = PETSC_TRUE;
4045: } else {
4046: a->inode.use = PETSC_FALSE;
4047: }
4048: PetscFunctionReturn(PETSC_SUCCESS);
4049: }
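/* Illustrative usage sketch (not part of the original source): binding switches between the CPU and GPU
   implementations installed above, so an application can temporarily force host execution.

     PetscCall(MatBindToCPU(A, PETSC_TRUE));  // subsequent MatMult() etc. use the SeqAIJ CPU kernels
     PetscCall(MatBindToCPU(A, PETSC_FALSE)); // restore the CUSPARSE kernels
*/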
4051: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4052: {
4053: Mat B;
4055: PetscFunctionBegin;
4056: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4057: if (reuse == MAT_INITIAL_MATRIX) {
4058: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4059: } else if (reuse == MAT_REUSE_MATRIX) {
4060: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4061: }
4062: B = *newmat;
4064: PetscCall(PetscFree(B->defaultvectype));
4065: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4067: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4068: if (B->factortype == MAT_FACTOR_NONE) {
4069: Mat_SeqAIJCUSPARSE *spptr;
4070: PetscCall(PetscNew(&spptr));
4071: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4072: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4073: spptr->format = MAT_CUSPARSE_CSR;
4074: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4075: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4076: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4077: #else
4078: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4079: #endif
4080: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default; we only support a column-major dense matrix B */
4081: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4082: #endif
4083: B->spptr = spptr;
4084: } else {
4085: Mat_SeqAIJCUSPARSETriFactors *spptr;
4087: PetscCall(PetscNew(&spptr));
4088: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4089: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4090: B->spptr = spptr;
4091: }
4092: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4093: }
4094: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4095: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4096: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4097: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4098: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4099: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4100: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4102: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4103: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4104: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4105: #if defined(PETSC_HAVE_HYPRE)
4106: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4107: #endif
4108: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4109: PetscFunctionReturn(PETSC_SUCCESS);
4110: }
4112: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4113: {
4114: PetscFunctionBegin;
4115: PetscCall(MatCreate_SeqAIJ(B));
4116: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4117: PetscFunctionReturn(PETSC_SUCCESS);
4118: }
4120: /*MC
4121: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4123: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4124: CSR, ELL, or Hybrid format.
4125: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4127: Options Database Keys:
4128: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4129: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4130: Other options include ell (ellpack) or hyb (hybrid).
4131: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4132: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4134: Level: beginner
4136: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4137: M*/
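/* Example usage (illustrative sketch, not part of the original source): the type is usually selected from
   the options database so the same code runs on CPU or GPU depending on -mat_type.

     Mat      A;
     PetscInt n = 100;

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetFromOptions(A));                  // picks up -mat_type aijcusparse
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL)); // preallocation still uses the AIJ API
*/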
4139: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4140: {
4141: PetscFunctionBegin;
4142: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4143: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4144: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4145: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4146: PetscFunctionReturn(PETSC_SUCCESS);
4147: }
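/* Illustrative usage sketch (not part of the original source), assuming an already configured KSP `ksp`:
   once registered, the cusparse solver type can be requested for the factorizations above, e.g. for ILU.

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
     // equivalently on the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse
*/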
4149: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4150: {
4151: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4153: PetscFunctionBegin;
4154: if (cusp) {
4155: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4156: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4157: delete cusp->workVector;
4158: delete cusp->rowoffsets_gpu;
4159: delete cusp->csr2csc_i;
4160: delete cusp->coords;
4161: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4162: PetscCall(PetscFree(mat->spptr));
4163: }
4164: PetscFunctionReturn(PETSC_SUCCESS);
4165: }
4167: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4168: {
4169: PetscFunctionBegin;
4170: if (*mat) {
4171: delete (*mat)->values;
4172: delete (*mat)->column_indices;
4173: delete (*mat)->row_offsets;
4174: delete *mat;
4175: *mat = 0;
4176: }
4177: PetscFunctionReturn(PETSC_SUCCESS);
4178: }
4180: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4181: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4182: {
4183: PetscFunctionBegin;
4184: if (*trifactor) {
4185: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4186: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4187: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4188: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4189: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4190: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4191: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4192: #endif
4193: PetscCall(PetscFree(*trifactor));
4194: }
4195: PetscFunctionReturn(PETSC_SUCCESS);
4196: }
4197: #endif
4199: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4200: {
4201: CsrMatrix *mat;
4203: PetscFunctionBegin;
4204: if (*matstruct) {
4205: if ((*matstruct)->mat) {
4206: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4207: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4208: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4209: #else
4210: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4211: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4212: #endif
4213: } else {
4214: mat = (CsrMatrix *)(*matstruct)->mat;
4215: PetscCall(CsrMatrix_Destroy(&mat));
4216: }
4217: }
4218: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4219: delete (*matstruct)->cprowIndices;
4220: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4221: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4222: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4224: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4225: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4226: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4228: for (int i = 0; i < 3; i++) {
4229: if (mdata->cuSpMV[i].initialized) {
4230: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4231: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4232: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4233: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4234: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4235: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4236: #endif
4237: }
4238: }
4239: #endif
4240: delete *matstruct;
4241: *matstruct = NULL;
4242: }
4243: PetscFunctionReturn(PETSC_SUCCESS);
4244: }
4246: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4247: {
4248: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4250: PetscFunctionBegin;
4251: if (fs) {
4252: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4253: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4254: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4255: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4256: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4257: delete fs->workVector;
4258: fs->workVector = NULL;
4259: #endif
4260: delete fs->rpermIndices;
4261: delete fs->cpermIndices;
4262: fs->rpermIndices = NULL;
4263: fs->cpermIndices = NULL;
4264: fs->init_dev_prop = PETSC_FALSE;
4265: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4266: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4267: PetscCallCUDA(cudaFree(fs->csrColIdx));
4268: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4269: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4270: PetscCallCUDA(cudaFree(fs->csrVal));
4271: PetscCallCUDA(cudaFree(fs->diag));
4272: PetscCallCUDA(cudaFree(fs->X));
4273: PetscCallCUDA(cudaFree(fs->Y));
4274: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M is shared with one of spsvBuffer_L/U */
4275: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4276: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4277: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4278: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4279: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4280: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4281: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4282: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4283: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4284: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4285: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4286: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4287: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4288: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4289: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4290: PetscCall(PetscFree(fs->csrRowPtr_h));
4291: PetscCall(PetscFree(fs->csrVal_h));
4292: PetscCall(PetscFree(fs->diag_h));
4293: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4294: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4295: #endif
4296: }
4297: PetscFunctionReturn(PETSC_SUCCESS);
4298: }
4300: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4301: {
4302: PetscFunctionBegin;
4303: if (*trifactors) {
4304: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4305: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4306: PetscCall(PetscFree(*trifactors));
4307: }
4308: PetscFunctionReturn(PETSC_SUCCESS);
4309: }
4311: struct IJCompare {
4312: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4313: {
4314: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4315: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4316: return false;
4317: }
4318: };
4320: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4321: {
4322: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4324: PetscFunctionBegin;
4325: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4326: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4327: if (destroy) {
4328: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4329: delete cusp->csr2csc_i;
4330: cusp->csr2csc_i = NULL;
4331: }
4332: A->transupdated = PETSC_FALSE;
4333: PetscFunctionReturn(PETSC_SUCCESS);
4334: }
4336: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4337: {
4338: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4340: PetscFunctionBegin;
4341: PetscCallCUDA(cudaFree(coo->perm));
4342: PetscCallCUDA(cudaFree(coo->jmap));
4343: PetscCall(PetscFree(coo));
4344: PetscFunctionReturn(PETSC_SUCCESS);
4345: }
4347: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4348: {
4349: PetscBool dev_ij = PETSC_FALSE;
4350: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4351: PetscInt *i, *j;
4352: PetscContainer container_h;
4353: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4355: PetscFunctionBegin;
4356: PetscCall(PetscGetMemType(coo_i, &mtype));
4357: if (PetscMemTypeDevice(mtype)) {
4358: dev_ij = PETSC_TRUE;
4359: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4360: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4361: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4362: } else {
4363: i = coo_i;
4364: j = coo_j;
4365: }
4367: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4368: if (dev_ij) PetscCall(PetscFree2(i, j));
4369: mat->offloadmask = PETSC_OFFLOAD_CPU;
4370: // Create the GPU memory
4371: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4373: // Copy the COO struct to device
4374: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4375: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4376: PetscCall(PetscMalloc1(1, &coo_d));
4377: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4378: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4379: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4380: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4381: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4383: // Put the COO struct in a container and then attach that to the matrix
4384: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4385: PetscFunctionReturn(PETSC_SUCCESS);
4386: }
4388: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4389: {
4390: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4391: const PetscCount grid_size = gridDim.x * blockDim.x;
4392: for (; i < nnz; i += grid_size) {
4393: PetscScalar sum = 0.0;
4394: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4395: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4396: }
4397: }
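/*
   Illustrative example (not part of this source) of the jmap/perm layout consumed by the kernel above,
   consistent with how it indexes the arrays: for COO input (i,j) = (0,0), (0,1), (0,0) the assembled
   matrix has two nonzeros, (0,0) and (0,1). With jmap = {0, 2, 3} and perm = {0, 2, 1} the kernel computes

     a[0] = v[0] + v[2];   // the two duplicate (0,0) entries are summed
     a[1] = v[1];

   plus the previous a[] when imode != INSERT_VALUES.
*/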
4399: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4400: {
4401: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4402: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4403: PetscCount Annz = seq->nz;
4404: PetscMemType memtype;
4405: const PetscScalar *v1 = v;
4406: PetscScalar *Aa;
4407: PetscContainer container;
4408: MatCOOStruct_SeqAIJ *coo;
4410: PetscFunctionBegin;
4411: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4413: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4414: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4416: PetscCall(PetscGetMemType(v, &memtype));
4417: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4418: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4419: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4420: }
4422: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4423: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4425: PetscCall(PetscLogGpuTimeBegin());
4426: if (Annz) {
4427: MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4428: PetscCallCUDA(cudaPeekAtLastError());
4429: }
4430: PetscCall(PetscLogGpuTimeEnd());
4432: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4433: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4435: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4436: PetscFunctionReturn(PETSC_SUCCESS);
4437: }
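/*
   Illustrative sketch (not part of this source; names are hypothetical and error checking is elided)
   of the user-level COO interface that ends up in the two routines above. v[] may be a host or a
   device array; MatSetValuesCOO_SeqAIJCUSPARSE() copies host data to the device before launching
   the kernel.

     Mat         A;
     PetscInt    coo_i[] = {0, 0, 1, 1};
     PetscInt    coo_j[] = {0, 1, 0, 1};
     PetscScalar v[]     = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j)); // dispatches to MatSetPreallocationCOO_SeqAIJCUSPARSE()
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // dispatches to MatSetValuesCOO_SeqAIJCUSPARSE()
*/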
4439: /*@C
4440: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4442: Not Collective
4444: Input Parameters:
4445: + A - the matrix
4446: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4448: Output Parameters:
4449: + i - the CSR row pointers
4450: - j - the CSR column indices
4452: Level: developer
4454: Note:
4455: When compressed is true, the CSR structure does not contain empty rows
4457: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4458: @*/
4459: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4460: {
4461: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4462: CsrMatrix *csr;
4463: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4465: PetscFunctionBegin;
4467: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4468: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4469: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4470: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4471: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4472: csr = (CsrMatrix *)cusp->mat->mat;
4473: if (i) {
4474: if (!compressed && a->compressedrow.use) { /* need full row offset */
4475: if (!cusp->rowoffsets_gpu) {
4476: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4477: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4478: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4479: }
4480: *i = cusp->rowoffsets_gpu->data().get();
4481: } else *i = csr->row_offsets->data().get();
4482: }
4483: if (j) *j = csr->column_indices->data().get();
4484: PetscFunctionReturn(PETSC_SUCCESS);
4485: }
4487: /*@C
4488: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4490: Not Collective
4492: Input Parameters:
4493: + A - the matrix
4494: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4495: . i - the CSR row pointers
4496: - j - the CSR column indices
4498: Level: developer
4500: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4501: @*/
4502: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4503: {
4504: PetscFunctionBegin;
4506: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4507: if (i) *i = NULL;
4508: if (j) *j = NULL;
4509: (void)compressed;
4510: PetscFunctionReturn(PETSC_SUCCESS);
4511: }
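/*
   Illustrative sketch (not part of this source; A is a hypothetical assembled MATSEQAIJCUSPARSE
   matrix, error checking elided) of the GetIJ/RestoreIJ pairing documented above. The returned
   pointers are device addresses of 32-bit CSR arrays.

     const int *di, *dj;

     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj)); // PETSC_FALSE: full row offsets, one per row
     // ... use di/dj from a CUDA kernel or a cuSPARSE call ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/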
4513: /*@C
4514: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4516: Not Collective
4518: Input Parameter:
4519: . A - a `MATSEQAIJCUSPARSE` matrix
4521: Output Parameter:
4522: . a - pointer to the device data
4524: Level: developer
4526: Note:
4527: May trigger host-to-device copies if the up-to-date matrix data is on the host
4529: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4530: @*/
4531: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4532: {
4533: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4534: CsrMatrix *csr;
4536: PetscFunctionBegin;
4538: PetscAssertPointer(a, 2);
4539: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4540: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4541: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4542: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4543: csr = (CsrMatrix *)cusp->mat->mat;
4544: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4545: *a = csr->values->data().get();
4546: PetscFunctionReturn(PETSC_SUCCESS);
4547: }
4549: /*@C
4550: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4552: Not Collective
4554: Input Parameters:
4555: + A - a `MATSEQAIJCUSPARSE` matrix
4556: - a - pointer to the device data
4558: Level: developer
4560: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4561: @*/
4562: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4563: {
4564: PetscFunctionBegin;
4566: PetscAssertPointer(a, 2);
4567: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4568: *a = NULL;
4569: PetscFunctionReturn(PETSC_SUCCESS);
4570: }
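/*
   Illustrative sketch (not part of this source; A and nz are hypothetical, error checking elided):
   read-only access to the device value array, copying the stored nonzeros back to the host.

     const PetscScalar *da;
     PetscScalar       *host;

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da));
     PetscCall(PetscMalloc1(nz, &host));
     PetscCallCUDA(cudaMemcpy(host, da, nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da));
*/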
4572: /*@C
4573: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4575: Not Collective
4577: Input Parameter:
4578: . A - a `MATSEQAIJCUSPARSE` matrix
4580: Output Parameter:
4581: . a - pointer to the device data
4583: Level: developer
4585: Note:
4586: May trigger host-to-device copies if the up-to-date matrix data is on the host
4588: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4589: @*/
4590: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4591: {
4592: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4593: CsrMatrix *csr;
4595: PetscFunctionBegin;
4597: PetscAssertPointer(a, 2);
4598: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4600: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4601: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4602: csr = (CsrMatrix *)cusp->mat->mat;
4603: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4604: *a = csr->values->data().get();
4605: A->offloadmask = PETSC_OFFLOAD_GPU;
4606: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4607: PetscFunctionReturn(PETSC_SUCCESS);
4608: }
4609: /*@C
4610: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4612: Not Collective
4614: Input Parameters:
4615: + A - a `MATSEQAIJCUSPARSE` matrix
4616: - a - pointer to the device data
4618: Level: developer
4620: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4621: @*/
4622: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4623: {
4624: PetscFunctionBegin;
4626: PetscAssertPointer(a, 2);
4627: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4628: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4629: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4630: *a = NULL;
4631: PetscFunctionReturn(PETSC_SUCCESS);
4632: }
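/*
   Illustrative sketch (not part of this source; A, nz and ScaleNonzeros are hypothetical, error
   checking elided): read-write access to the device value array. The Restore call bumps the object
   state so PETSc knows the numerical values changed.

     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &da));
     ScaleNonzeros<<<(nz + 255) / 256, 256>>>(nz, 2.0, da); // hypothetical user kernel: da[k] *= 2.0
     PetscCallCUDA(cudaPeekAtLastError());
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &da));
*/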
4634: /*@C
4635: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4637: Not Collective
4639: Input Parameter:
4640: . A - a `MATSEQAIJCUSPARSE` matrix
4642: Output Parameter:
4643: . a - pointer to the device data
4645: Level: developer
4647: Note:
4648: Does not trigger host-to-device copies; it marks the data on the GPU as valid
4650: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4651: @*/
4652: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4653: {
4654: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4655: CsrMatrix *csr;
4657: PetscFunctionBegin;
4659: PetscAssertPointer(a, 2);
4660: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4661: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4662: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4663: csr = (CsrMatrix *)cusp->mat->mat;
4664: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4665: *a = csr->values->data().get();
4666: A->offloadmask = PETSC_OFFLOAD_GPU;
4667: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4668: PetscFunctionReturn(PETSC_SUCCESS);
4669: }
4671: /*@C
4672: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4674: Not Collective
4676: Input Parameters:
4677: + A - a `MATSEQAIJCUSPARSE` matrix
4678: - a - pointer to the device data
4680: Level: developer
4682: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4683: @*/
4684: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4685: {
4686: PetscFunctionBegin;
4688: PetscAssertPointer(a, 2);
4689: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4690: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4691: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4692: *a = NULL;
4693: PetscFunctionReturn(PETSC_SUCCESS);
4694: }
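/*
   Illustrative sketch (not part of this source; A and nz are hypothetical, error checking elided):
   write-only access, appropriate when every stored value is about to be overwritten, so no
   host-to-device copy of stale values is triggered.

     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &da));
     PetscCallCUDA(cudaMemset(da, 0, nz * sizeof(PetscScalar))); // overwrite all stored nonzeros with zero
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &da));
*/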
4696: struct IJCompare4 {
4697: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4698: {
4699: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4700: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4701: return false;
4702: }
4703: };
4705: struct Shift {
4706: int _shift;
4708: Shift(int shift) : _shift(shift) { }
4709: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4710: };
4712: /* Merges two SeqAIJCUSPARSE matrices A and B by concatenating their rows side by side, i.e. C = [A, B] ([A';B']' in MATLAB notation) */
4713: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4714: {
4715: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4716: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4717: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4718: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4719: PetscInt Annz, Bnnz;
4720: cusparseStatus_t stat;
4721: PetscInt i, m, n, zero = 0;
4723: PetscFunctionBegin;
4726: PetscAssertPointer(C, 4);
4727: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4728: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4729: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4730: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4731: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4732: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4733: if (reuse == MAT_INITIAL_MATRIX) {
4734: m = A->rmap->n;
4735: n = A->cmap->n + B->cmap->n;
4736: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4737: PetscCall(MatSetSizes(*C, m, n, m, n));
4738: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4739: c = (Mat_SeqAIJ *)(*C)->data;
4740: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4741: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4742: Ccsr = new CsrMatrix;
4743: Cmat->cprowIndices = NULL;
4744: c->compressedrow.use = PETSC_FALSE;
4745: c->compressedrow.nrows = 0;
4746: c->compressedrow.i = NULL;
4747: c->compressedrow.rindex = NULL;
4748: Ccusp->workVector = NULL;
4749: Ccusp->nrows = m;
4750: Ccusp->mat = Cmat;
4751: Ccusp->mat->mat = Ccsr;
4752: Ccsr->num_rows = m;
4753: Ccsr->num_cols = n;
4754: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4755: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4756: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4757: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4758: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4759: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4760: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4761: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4762: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4763: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4764: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4765: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4766: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4768: Acsr = (CsrMatrix *)Acusp->mat->mat;
4769: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4770: Annz = (PetscInt)Acsr->column_indices->size();
4771: Bnnz = (PetscInt)Bcsr->column_indices->size();
4772: c->nz = Annz + Bnnz;
4773: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4774: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4775: Ccsr->values = new THRUSTARRAY(c->nz);
4776: Ccsr->num_entries = c->nz;
4777: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4778: if (c->nz) {
4779: auto Acoo = new THRUSTINTARRAY32(Annz);
4780: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4781: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4782: THRUSTINTARRAY32 *Aroff, *Broff;
4784: if (a->compressedrow.use) { /* need full row offset */
4785: if (!Acusp->rowoffsets_gpu) {
4786: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4787: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4788: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4789: }
4790: Aroff = Acusp->rowoffsets_gpu;
4791: } else Aroff = Acsr->row_offsets;
4792: if (b->compressedrow.use) { /* need full row offset */
4793: if (!Bcusp->rowoffsets_gpu) {
4794: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4795: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4796: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4797: }
4798: Broff = Bcusp->rowoffsets_gpu;
4799: } else Broff = Bcsr->row_offsets;
4800: PetscCall(PetscLogGpuTimeBegin());
4801: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4802: PetscCallCUSPARSE(stat);
4803: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4804: PetscCallCUSPARSE(stat);
4805: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4806: auto Aperm = thrust::make_constant_iterator(1);
4807: auto Bperm = thrust::make_constant_iterator(0);
4808: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4809: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4810: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4811: #else
4812: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4813: auto Bcib = Bcsr->column_indices->begin();
4814: auto Bcie = Bcsr->column_indices->end();
4815: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4816: #endif
4817: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4818: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4819: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4820: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4821: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4822: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4823: auto p1 = Ccusp->coords->begin();
4824: auto p2 = Ccusp->coords->begin();
4825: thrust::advance(p2, Annz);
4826: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4827: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4828: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4829: #endif
4830: auto cci = thrust::make_counting_iterator(zero);
4831: auto cce = thrust::make_counting_iterator(c->nz);
4832: #if 0 //Errors on SUMMIT cuda 11.1.0
4833: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4834: #else
4835: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4836: auto pred = thrust::identity<int>();
4837: #else
4838: auto pred = cuda::std::identity();
4839: #endif
4840: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4841: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4842: #endif
4843: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4844: PetscCallCUSPARSE(stat);
4845: PetscCall(PetscLogGpuTimeEnd());
4846: delete wPerm;
4847: delete Acoo;
4848: delete Bcoo;
4849: delete Ccoo;
4850: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4851: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4852: PetscCallCUSPARSE(stat);
4853: #endif
4854: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4855: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4856: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4857: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4858: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4859: CsrMatrix *CcsrT = new CsrMatrix;
4860: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4861: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4863: (*C)->form_explicit_transpose = PETSC_TRUE;
4864: (*C)->transupdated = PETSC_TRUE;
4865: Ccusp->rowoffsets_gpu = NULL;
4866: CmatT->cprowIndices = NULL;
4867: CmatT->mat = CcsrT;
4868: CcsrT->num_rows = n;
4869: CcsrT->num_cols = m;
4870: CcsrT->num_entries = c->nz;
4872: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4873: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4874: CcsrT->values = new THRUSTARRAY(c->nz);
4876: PetscCall(PetscLogGpuTimeBegin());
4877: auto rT = CcsrT->row_offsets->begin();
4878: if (AT) {
4879: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4880: thrust::advance(rT, -1);
4881: }
4882: if (BT) {
4883: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4884: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4885: thrust::copy(titb, tite, rT);
4886: }
4887: auto cT = CcsrT->column_indices->begin();
4888: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4889: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4890: auto vT = CcsrT->values->begin();
4891: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4892: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4893: PetscCall(PetscLogGpuTimeEnd());
4895: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4896: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4897: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4898: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4899: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4900: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4901: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4902: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4903: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4904: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4905: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4906: PetscCallCUSPARSE(stat);
4907: #endif
4908: Ccusp->matTranspose = CmatT;
4909: }
4910: }
4912: c->free_a = PETSC_TRUE;
4913: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4914: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4915: c->free_ij = PETSC_TRUE;
4916: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4917: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4918: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4919: ii = *Ccsr->row_offsets;
4920: jj = *Ccsr->column_indices;
4921: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4922: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4923: } else {
4924: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4925: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4926: }
4927: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4928: PetscCall(PetscMalloc1(m, &c->ilen));
4929: PetscCall(PetscMalloc1(m, &c->imax));
4930: c->maxnz = c->nz;
4931: c->nonzerorowcnt = 0;
4932: c->rmax = 0;
4933: for (i = 0; i < m; i++) {
4934: const PetscInt nn = c->i[i + 1] - c->i[i];
4935: c->ilen[i] = c->imax[i] = nn;
4936: c->nonzerorowcnt += (PetscInt)!!nn;
4937: c->rmax = PetscMax(c->rmax, nn);
4938: }
4939: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4940: PetscCall(PetscMalloc1(c->nz, &c->a));
4941: (*C)->nonzerostate++;
4942: PetscCall(PetscLayoutSetUp((*C)->rmap));
4943: PetscCall(PetscLayoutSetUp((*C)->cmap));
4944: Ccusp->nonzerostate = (*C)->nonzerostate;
4945: (*C)->preallocated = PETSC_TRUE;
4946: } else {
4947: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4948: c = (Mat_SeqAIJ *)(*C)->data;
4949: if (c->nz) {
4950: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4951: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4952: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4953: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4954: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4955: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4956: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4957: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4958: Acsr = (CsrMatrix *)Acusp->mat->mat;
4959: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4960: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4961: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4962: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4963: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4964: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4965: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4966: auto pmid = Ccusp->coords->begin();
4967: thrust::advance(pmid, Acsr->num_entries);
4968: PetscCall(PetscLogGpuTimeBegin());
4969: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4970: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4971: thrust::for_each(zibait, zieait, VecCUDAEquals());
4972: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4973: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4974: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4975: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4976: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4977: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4978: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4979: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4980: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4981: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4982: auto vT = CcsrT->values->begin();
4983: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4984: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4985: (*C)->transupdated = PETSC_TRUE;
4986: }
4987: PetscCall(PetscLogGpuTimeEnd());
4988: }
4989: }
4990: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4991: (*C)->assembled = PETSC_TRUE;
4992: (*C)->was_assembled = PETSC_FALSE;
4993: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4994: PetscFunctionReturn(PETSC_SUCCESS);
4995: }
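/*
   Illustrative sketch (not part of this source; A and B are hypothetical m x nA and m x nB
   MATSEQAIJCUSPARSE matrices, error checking elided): building C = [A, B] once and then refreshing
   its values after A and B changed numerically while keeping their nonzero structure.

     Mat C;

     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... update the values (not the sparsity pattern) of A and B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
     PetscCall(MatDestroy(&C));
*/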
4997: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4998: {
4999: bool dmem;
5000: const PetscScalar *av;
5002: PetscFunctionBegin;
5003: dmem = isCudaMem(v);
5004: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5005: if (n && idx) {
5006: THRUSTINTARRAY widx(n);
5007: widx.assign(idx, idx + n);
5008: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5010: THRUSTARRAY *w = NULL;
5011: thrust::device_ptr<PetscScalar> dv;
5012: if (dmem) {
5013: dv = thrust::device_pointer_cast(v);
5014: } else {
5015: w = new THRUSTARRAY(n);
5016: dv = w->data();
5017: }
5018: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5020: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5021: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5022: thrust::for_each(zibit, zieit, VecCUDAEquals());
5023: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5024: delete w;
5025: } else {
5026: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5027: }
5028: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
5029: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5030: PetscFunctionReturn(PETSC_SUCCESS);
5031: }
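/*
   Illustrative example (not part of this source) of the gather performed above: with n = 2 and
   idx = {5, 2}, the routine produces v[0] = a[5] and v[1] = a[2], where a is the device CSR value
   array of A; v may be host or device memory, and the copy back to the host happens only in the
   host case.
*/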