Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #include <thrust/async/for_each.h>
19: #endif
20: #include <thrust/iterator/constant_iterator.h>
21: #include <thrust/remove.h>
22: #include <thrust/sort.h>
23: #include <thrust/unique.h>
25: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
26: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
29: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
31: typedef enum {
32: CUSPARSE_MV_ALG_DEFAULT = 0,
33: CUSPARSE_COOMV_ALG = 1,
34: CUSPARSE_CSRMV_ALG1 = 2,
35: CUSPARSE_CSRMV_ALG2 = 3
36: } cusparseSpMVAlg_t;
38: typedef enum {
39: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
41: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
42: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
43: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
44: CUSPARSE_SPMM_ALG_DEFAULT = 0,
45: CUSPARSE_SPMM_COO_ALG1 = 1,
46: CUSPARSE_SPMM_COO_ALG2 = 2,
47: CUSPARSE_SPMM_COO_ALG3 = 3,
48: CUSPARSE_SPMM_COO_ALG4 = 5,
49: CUSPARSE_SPMM_CSR_ALG1 = 4,
50: CUSPARSE_SPMM_CSR_ALG2 = 6,
51: } cusparseSpMMAlg_t;
53: typedef enum {
54: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
56: } cusparseCsr2CscAlg_t;
57: */
58: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61: #endif
63: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
68: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
69: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73: #endif
74: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
75: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
76: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
77: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
78: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
79: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
80: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
85: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
88: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
90: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
93: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
98: {
99: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
101: PetscFunctionBegin;
102: switch (op) {
103: case MAT_CUSPARSE_MULT:
104: cusparsestruct->format = format;
105: break;
106: case MAT_CUSPARSE_ALL:
107: cusparsestruct->format = format;
108: break;
109: default:
110: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
111: }
112: PetscFunctionReturn(PETSC_SUCCESS);
113: }
115: /*@
116: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
117: operation. Only the `MatMult()` operation can use different GPU storage formats.
119: Not Collective
121: Input Parameters:
122: + A - Matrix of type `MATSEQAIJCUSPARSE`
123: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
124: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
125: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, or `MAT_CUSPARSE_HYB`).
127: Level: intermediate
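   Example Usage:
   A minimal sketch, assuming `A` is an assembled matrix of type `MATSEQAIJCUSPARSE`:
.vb
   PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL)); // use ELL for MatMult() only
   PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, MAT_CUSPARSE_CSR));  // or use CSR for all operations
.ve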
129: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
130: @*/
131: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
132: {
133: PetscFunctionBegin;
135: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
139: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
140: {
141: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
143: PetscFunctionBegin;
144: cusparsestruct->use_cpu_solve = use_cpu;
145: PetscFunctionReturn(PETSC_SUCCESS);
146: }
148: /*@
149: MatCUSPARSESetUseCPUSolve - Sets whether to use the built-in CPU `MatSolve()` instead of the GPU solve.
151: Input Parameters:
152: + A - Matrix of type `MATSEQAIJCUSPARSE`
153: - use_cpu - flag indicating whether to use the built-in CPU `MatSolve()`
155: Level: intermediate
157: Note:
158: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
159: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
160: This routine specifies whether the solve is done on the CPU or the GPU (the GPU is the default).
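   Example Usage:
   A minimal sketch, assuming `A` is a `MATSEQAIJCUSPARSE` matrix that will be factored with (I)LU:
.vb
   PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE)); // keep the triangular solves on the CPU
.ve
   The same behavior can be requested from the options database with -mat_cusparse_use_cpu_solve (see MatSetFromOptions_SeqAIJCUSPARSE() below).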
162: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
163: @*/
164: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
165: {
166: PetscFunctionBegin;
168: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
169: PetscFunctionReturn(PETSC_SUCCESS);
170: }
172: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
173: {
174: PetscFunctionBegin;
175: switch (op) {
176: case MAT_FORM_EXPLICIT_TRANSPOSE:
177: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
178: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
179: A->form_explicit_transpose = flg;
180: break;
181: default:
182: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
183: break;
184: }
185: PetscFunctionReturn(PETSC_SUCCESS);
186: }
188: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
189: {
190: MatCUSPARSEStorageFormat format;
191: PetscBool flg;
192: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
194: PetscFunctionBegin;
195: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
196: if (A->factortype == MAT_FACTOR_NONE) {
197: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
198: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
200: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
201: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
202: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
203: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
204: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
205: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
206: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
207: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
208: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
209: #else
210: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211: #endif
212: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
213: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
215: PetscCall(
216: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
217: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
218: #endif
219: }
220: PetscOptionsHeadEnd();
221: PetscFunctionReturn(PETSC_SUCCESS);
222: }
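/* A minimal usage sketch (assumed application name "./app", not part of this source): the options registered above are read
   from the options database, e.g.
     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell -mat_cusparse_use_cpu_solve
   The option names correspond to the PetscOptionsEnum()/PetscOptionsBool() calls in MatSetFromOptions_SeqAIJCUSPARSE() above,
   and the enum values to MatCUSPARSEStorageFormats[] defined near the top of this file. */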
224: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
225: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
226: {
227: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
228: PetscInt m = A->rmap->n;
229: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
230: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
231: const MatScalar *Aa = a->a;
232: PetscInt *Mi, *Mj, Mnz;
233: PetscScalar *Ma;
235: PetscFunctionBegin;
236: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
237: if (!fs->csrRowPtr) { // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
238: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
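// That is, row i of M stores the strictly lower triangular entries of L, then the diagonal entry of U (column i), then the entries of U to the right of the diagonal.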
239: Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
240: PetscCall(PetscMalloc1(m + 1, &Mi));
241: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
242: PetscCall(PetscMalloc1(Mnz, &Ma));
243: Mi[0] = 0;
244: for (PetscInt i = 0; i < m; i++) {
245: PetscInt llen = Ai[i + 1] - Ai[i];
246: PetscInt ulen = Adiag[i] - Adiag[i + 1];
247: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
248: Mj[Mi[i] + llen] = i; // diagonal entry
249: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
250: Mi[i + 1] = Mi[i] + llen + ulen;
251: }
252: // Copy M (L,U) from host to device
253: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
254: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
255: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
256: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
257: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
259: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
260: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
261: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
262: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
263: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
264: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
265: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
266: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
268: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
269: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
270: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
272: fillMode = CUSPARSE_FILL_MODE_UPPER;
273: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
274: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
275: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
276: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
278: // Allocate work vectors in SpSv
279: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
280: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
282: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
283: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
285: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
286: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
287: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
288: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
289: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
290: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
291: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
293: // Record for reuse
294: fs->csrRowPtr_h = Mi;
295: fs->csrVal_h = Ma;
296: PetscCall(PetscFree(Mj));
297: }
298: // Copy the value
299: Mi = fs->csrRowPtr_h;
300: Ma = fs->csrVal_h;
301: Mnz = Mi[m];
302: for (PetscInt i = 0; i < m; i++) {
303: PetscInt llen = Ai[i + 1] - Ai[i];
304: PetscInt ulen = Adiag[i] - Adiag[i + 1];
305: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
306: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry
307: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
308: }
309: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
311: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
312: if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
313: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
314: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
315: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
316: } else
317: #endif
318: {
319: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
320: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
322: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
323: fs->updatedSpSVAnalysis = PETSC_TRUE;
324: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
325: }
326: }
327: PetscFunctionReturn(PETSC_SUCCESS);
328: }
329: #else
330: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
331: {
332: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
333: PetscInt n = A->rmap->n;
334: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
335: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
336: const PetscInt *ai = a->i, *aj = a->j, *vi;
337: const MatScalar *aa = a->a, *v;
338: PetscInt *AiLo, *AjLo;
339: PetscInt i, nz, nzLower, offset, rowOffset;
341: PetscFunctionBegin;
342: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
343: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
344: try {
345: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
346: nzLower = n + ai[n] - ai[1];
347: if (!loTriFactor) {
348: PetscScalar *AALo;
350: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
352: /* Allocate Space for the lower triangular matrix */
353: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
354: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
356: /* Fill the lower triangular matrix */
357: AiLo[0] = (PetscInt)0;
358: AiLo[n] = nzLower;
359: AjLo[0] = (PetscInt)0;
360: AALo[0] = (MatScalar)1.0;
361: v = aa;
362: vi = aj;
363: offset = 1;
364: rowOffset = 1;
365: for (i = 1; i < n; i++) {
366: nz = ai[i + 1] - ai[i];
367: /* additional 1 for the term on the diagonal */
368: AiLo[i] = rowOffset;
369: rowOffset += nz + 1;
371: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
372: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
374: offset += nz;
375: AjLo[offset] = (PetscInt)i;
376: AALo[offset] = (MatScalar)1.0;
377: offset += 1;
379: v += nz;
380: vi += nz;
381: }
383: /* allocate space for the triangular factor information */
384: PetscCall(PetscNew(&loTriFactor));
385: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
386: /* Create the matrix description */
387: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
388: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
389: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
390: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
391: #else
392: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
393: #endif
394: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
395: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
397: /* set the operation */
398: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
400: /* set the matrix */
401: loTriFactor->csrMat = new CsrMatrix;
402: loTriFactor->csrMat->num_rows = n;
403: loTriFactor->csrMat->num_cols = n;
404: loTriFactor->csrMat->num_entries = nzLower;
406: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
407: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
409: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
412: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
415: /* Create the solve analysis information */
416: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
417: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
418: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
419: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
420: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
421: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
422: #endif
424: /* perform the solve analysis */
425: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
426: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
427: PetscCallCUDA(WaitForCUDA());
428: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
430: /* assign the pointer */
431: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
432: loTriFactor->AA_h = AALo;
433: PetscCallCUDA(cudaFreeHost(AiLo));
434: PetscCallCUDA(cudaFreeHost(AjLo));
435: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
436: } else { /* update values only */
437: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
438: /* Fill the lower triangular matrix */
439: loTriFactor->AA_h[0] = 1.0;
440: v = aa;
441: vi = aj;
442: offset = 1;
443: for (i = 1; i < n; i++) {
444: nz = ai[i + 1] - ai[i];
445: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
446: offset += nz;
447: loTriFactor->AA_h[offset] = 1.0;
448: offset += 1;
449: v += nz;
450: }
451: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
452: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
453: }
454: } catch (char *ex) {
455: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
456: }
457: }
458: PetscFunctionReturn(PETSC_SUCCESS);
459: }
461: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
462: {
463: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
464: PetscInt n = A->rmap->n;
465: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
466: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
467: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
468: const MatScalar *aa = a->a, *v;
469: PetscInt *AiUp, *AjUp;
470: PetscInt i, nz, nzUpper, offset;
472: PetscFunctionBegin;
473: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
474: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
475: try {
476: /* next, figure out the number of nonzeros in the upper triangular matrix. */
477: nzUpper = adiag[0] - adiag[n];
478: if (!upTriFactor) {
479: PetscScalar *AAUp;
481: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
483: /* Allocate Space for the upper triangular matrix */
484: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
485: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
487: /* Fill the upper triangular matrix */
488: AiUp[0] = (PetscInt)0;
489: AiUp[n] = nzUpper;
490: offset = nzUpper;
491: for (i = n - 1; i >= 0; i--) {
492: v = aa + adiag[i + 1] + 1;
493: vi = aj + adiag[i + 1] + 1;
495: /* number of elements NOT on the diagonal */
496: nz = adiag[i] - adiag[i + 1] - 1;
498: /* decrement the offset */
499: offset -= (nz + 1);
501: /* first, set the diagonal elements */
502: AjUp[offset] = (PetscInt)i;
503: AAUp[offset] = (MatScalar)1. / v[nz];
504: AiUp[i] = AiUp[i + 1] - (nz + 1);
506: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
507: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
508: }
510: /* allocate space for the triangular factor information */
511: PetscCall(PetscNew(&upTriFactor));
512: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
514: /* Create the matrix description */
515: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
516: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
517: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
518: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
519: #else
520: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
521: #endif
522: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
523: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
525: /* set the operation */
526: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
528: /* set the matrix */
529: upTriFactor->csrMat = new CsrMatrix;
530: upTriFactor->csrMat->num_rows = n;
531: upTriFactor->csrMat->num_cols = n;
532: upTriFactor->csrMat->num_entries = nzUpper;
534: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
535: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
537: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
538: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
540: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
541: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
543: /* Create the solve analysis information */
544: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
545: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
546: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
547: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
548: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
549: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
550: #endif
552: /* perform the solve analysis */
553: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
554: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
556: PetscCallCUDA(WaitForCUDA());
557: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
559: /* assign the pointer */
560: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
561: upTriFactor->AA_h = AAUp;
562: PetscCallCUDA(cudaFreeHost(AiUp));
563: PetscCallCUDA(cudaFreeHost(AjUp));
564: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
565: } else {
566: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
567: /* Fill the upper triangular matrix */
568: offset = nzUpper;
569: for (i = n - 1; i >= 0; i--) {
570: v = aa + adiag[i + 1] + 1;
572: /* number of elements NOT on the diagonal */
573: nz = adiag[i] - adiag[i + 1] - 1;
575: /* decrement the offset */
576: offset -= (nz + 1);
578: /* first, set the diagonal elements */
579: upTriFactor->AA_h[offset] = 1. / v[nz];
580: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
581: }
582: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
583: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
584: }
585: } catch (char *ex) {
586: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
587: }
588: }
589: PetscFunctionReturn(PETSC_SUCCESS);
590: }
591: #endif
593: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
594: {
595: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
596: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
597: IS isrow = a->row, isicol = a->icol;
598: PetscBool row_identity, col_identity;
599: PetscInt n = A->rmap->n;
601: PetscFunctionBegin;
602: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
603: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
604: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
605: #else
606: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
607: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
608: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
609: #endif
611: cusparseTriFactors->nnz = a->nz;
613: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
614: /* lower triangular indices */
615: PetscCall(ISIdentity(isrow, &row_identity));
616: if (!row_identity && !cusparseTriFactors->rpermIndices) {
617: const PetscInt *r;
619: PetscCall(ISGetIndices(isrow, &r));
620: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
621: cusparseTriFactors->rpermIndices->assign(r, r + n);
622: PetscCall(ISRestoreIndices(isrow, &r));
623: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
624: }
626: /* upper triangular indices */
627: PetscCall(ISIdentity(isicol, &col_identity));
628: if (!col_identity && !cusparseTriFactors->cpermIndices) {
629: const PetscInt *c;
631: PetscCall(ISGetIndices(isicol, &c));
632: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
633: cusparseTriFactors->cpermIndices->assign(c, c + n);
634: PetscCall(ISRestoreIndices(isicol, &c));
635: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
636: }
637: PetscFunctionReturn(PETSC_SUCCESS);
638: }
640: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
641: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
642: {
643: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
644: PetscInt m = A->rmap->n;
645: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
646: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
647: const MatScalar *Aa = a->a;
648: PetscInt *Mj, Mnz;
649: PetscScalar *Ma, *D;
651: PetscFunctionBegin;
652: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
653: if (!fs->csrRowPtr) { // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
654: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
655: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
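// That is, row i of M stores a unit diagonal entry at column i followed by the (negated) off-diagonal entries of U for that row; the (already inverted) diagonal is kept separately in fs->diag.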
656: Mnz = Ai[m]; // Unz (with the unit diagonal)
657: PetscCall(PetscMalloc1(Mnz, &Ma));
658: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
659: PetscCall(PetscMalloc1(m, &D)); // the diagonal
660: for (PetscInt i = 0; i < m; i++) {
661: PetscInt ulen = Ai[i + 1] - Ai[i];
662: Mj[Ai[i]] = i; // diagonal entry
663: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
664: }
665: // Copy M (U) from host to device
666: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
667: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
668: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
669: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
670: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
671: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
673: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
674: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
675: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
676: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
677: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
678: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
679: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
680: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
682: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
683: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
684: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
686: // Allocate work vectors in SpSv
687: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
688: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
690: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
691: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
693: // Query buffer sizes for SpSV and then allocate buffers
694: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
695: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
696: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
698: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
699: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
700: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
702: // Record for reuse
703: fs->csrVal_h = Ma;
704: fs->diag_h = D;
705: PetscCall(PetscFree(Mj));
706: }
707: // Copy the value
708: Ma = fs->csrVal_h;
709: D = fs->diag_h;
710: Mnz = Ai[m];
711: for (PetscInt i = 0; i < m; i++) {
712: D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
713: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
714: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
715: }
716: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
717: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
719: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
720: if (fs->updatedSpSVAnalysis) {
721: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
722: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
723: } else
724: #endif
725: {
726: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
727: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
728: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
729: fs->updatedSpSVAnalysis = PETSC_TRUE;
730: }
731: }
732: PetscFunctionReturn(PETSC_SUCCESS);
733: }
735: // Solve Ut D U x = b
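// i.e. x = U^{-1} (D^{-1} (U^{-T} b)): first solve Ut Y = X, then scale Y by fs->diag (which already stores the inverted diagonal), then solve U X = Y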
736: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
737: {
738: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
739: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
740: const PetscScalar *barray;
741: PetscScalar *xarray;
742: thrust::device_ptr<const PetscScalar> bGPU;
743: thrust::device_ptr<PetscScalar> xGPU;
744: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
745: PetscInt m = A->rmap->n;
747: PetscFunctionBegin;
748: PetscCall(PetscLogGpuTimeBegin());
749: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
750: PetscCall(VecCUDAGetArrayRead(b, &barray));
751: xGPU = thrust::device_pointer_cast(xarray);
752: bGPU = thrust::device_pointer_cast(barray);
754: // Reorder b with the row permutation if needed, and wrap the result in fs->X
755: if (fs->rpermIndices) {
756: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
757: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
758: } else {
759: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
760: }
762: // Solve Ut Y = X
763: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
764: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
766: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
767: // It is basically a vector element-wise multiplication, but cublas does not have it!
768: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
770: // Solve U X = Y
771: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
772: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
773: } else {
774: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
775: }
776: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
778: // Reorder X with the column permutation if needed, and put the result back to x
779: if (fs->cpermIndices) {
780: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
781: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
782: }
784: PetscCall(VecCUDARestoreArrayRead(b, &barray));
785: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
786: PetscCall(PetscLogGpuTimeEnd());
787: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
788: PetscFunctionReturn(PETSC_SUCCESS);
789: }
790: #else
791: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
792: {
793: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
794: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
795: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
796: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
797: PetscInt *AiUp, *AjUp;
798: PetscScalar *AAUp;
799: PetscScalar *AALo;
800: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
801: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
802: const PetscInt *ai = b->i, *aj = b->j, *vj;
803: const MatScalar *aa = b->a, *v;
805: PetscFunctionBegin;
806: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
807: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
808: try {
809: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
810: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
811: if (!upTriFactor && !loTriFactor) {
812: /* Allocate Space for the upper triangular matrix */
813: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
814: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
816: /* Fill the upper triangular matrix */
817: AiUp[0] = (PetscInt)0;
818: AiUp[n] = nzUpper;
819: offset = 0;
820: for (i = 0; i < n; i++) {
821: /* set the pointers */
822: v = aa + ai[i];
823: vj = aj + ai[i];
824: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
826: /* first, set the diagonal elements */
827: AjUp[offset] = (PetscInt)i;
828: AAUp[offset] = (MatScalar)1.0 / v[nz];
829: AiUp[i] = offset;
830: AALo[offset] = (MatScalar)1.0 / v[nz];
832: offset += 1;
833: if (nz > 0) {
834: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
835: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
836: for (j = offset; j < offset + nz; j++) {
837: AAUp[j] = -AAUp[j];
838: AALo[j] = AAUp[j] / v[nz];
839: }
840: offset += nz;
841: }
842: }
844: /* allocate space for the triangular factor information */
845: PetscCall(PetscNew(&upTriFactor));
846: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
848: /* Create the matrix description */
849: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
850: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
851: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
852: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
853: #else
854: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
855: #endif
856: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
857: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
859: /* set the matrix */
860: upTriFactor->csrMat = new CsrMatrix;
861: upTriFactor->csrMat->num_rows = A->rmap->n;
862: upTriFactor->csrMat->num_cols = A->cmap->n;
863: upTriFactor->csrMat->num_entries = a->nz;
865: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
866: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
868: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
869: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
871: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
872: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
874: /* set the operation */
875: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
877: /* Create the solve analysis information */
878: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
879: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
880: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
881: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
882: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
883: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
884: #endif
886: /* perform the solve analysis */
887: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
888: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
890: PetscCallCUDA(WaitForCUDA());
891: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
893: /* assign the pointer */
894: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
896: /* allocate space for the triangular factor information */
897: PetscCall(PetscNew(&loTriFactor));
898: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
900: /* Create the matrix description */
901: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
902: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
903: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
904: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
905: #else
906: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
907: #endif
908: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
909: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
911: /* set the operation */
912: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
914: /* set the matrix */
915: loTriFactor->csrMat = new CsrMatrix;
916: loTriFactor->csrMat->num_rows = A->rmap->n;
917: loTriFactor->csrMat->num_cols = A->cmap->n;
918: loTriFactor->csrMat->num_entries = a->nz;
920: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
921: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
923: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
924: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
926: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
927: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
929: /* Create the solve analysis information */
930: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
931: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
932: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
933: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
934: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
935: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
936: #endif
938: /* perform the solve analysis */
939: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
940: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
942: PetscCallCUDA(WaitForCUDA());
943: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
945: /* assign the pointer */
946: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
948: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
949: PetscCallCUDA(cudaFreeHost(AiUp));
950: PetscCallCUDA(cudaFreeHost(AjUp));
951: } else {
952: /* Fill the upper triangular matrix */
953: offset = 0;
954: for (i = 0; i < n; i++) {
955: /* set the pointers */
956: v = aa + ai[i];
957: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
959: /* first, set the diagonal elements */
960: AAUp[offset] = 1.0 / v[nz];
961: AALo[offset] = 1.0 / v[nz];
963: offset += 1;
964: if (nz > 0) {
965: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
966: for (j = offset; j < offset + nz; j++) {
967: AAUp[j] = -AAUp[j];
968: AALo[j] = AAUp[j] / v[nz];
969: }
970: offset += nz;
971: }
972: }
973: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing upTriFactor");
974: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing loTriFactor");
975: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
976: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
977: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
978: }
979: PetscCallCUDA(cudaFreeHost(AAUp));
980: PetscCallCUDA(cudaFreeHost(AALo));
981: } catch (char *ex) {
982: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
983: }
984: }
985: PetscFunctionReturn(PETSC_SUCCESS);
986: }
987: #endif
989: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
990: {
991: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
992: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
993: IS ip = a->row;
994: PetscBool perm_identity;
995: PetscInt n = A->rmap->n;
997: PetscFunctionBegin;
998: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
1000: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1001: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
1002: #else
1003: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1004: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1005: #endif
1006: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
1008: A->offloadmask = PETSC_OFFLOAD_BOTH;
1010: /* lower triangular indices */
1011: PetscCall(ISIdentity(ip, &perm_identity));
1012: if (!perm_identity) {
1013: IS iip;
1014: const PetscInt *irip, *rip;
1016: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1017: PetscCall(ISGetIndices(iip, &irip));
1018: PetscCall(ISGetIndices(ip, &rip));
1019: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1020: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1021: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1022: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1023: PetscCall(ISRestoreIndices(iip, &irip));
1024: PetscCall(ISDestroy(&iip));
1025: PetscCall(ISRestoreIndices(ip, &rip));
1026: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1027: }
1028: PetscFunctionReturn(PETSC_SUCCESS);
1029: }
1031: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1032: {
1033: PetscFunctionBegin;
1034: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1035: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1036: B->offloadmask = PETSC_OFFLOAD_CPU;
1038: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1039: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1040: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1041: #else
1042: /* determine which version of MatSolve needs to be used. */
1043: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1044: IS ip = b->row;
1045: PetscBool perm_identity;
1047: PetscCall(ISIdentity(ip, &perm_identity));
1048: if (perm_identity) {
1049: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1050: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1051: } else {
1052: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1053: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1054: }
1055: #endif
1056: B->ops->matsolve = NULL;
1057: B->ops->matsolvetranspose = NULL;
1059: /* get the triangular factors */
1060: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1061: PetscFunctionReturn(PETSC_SUCCESS);
1062: }
1064: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1065: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1066: {
1067: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1068: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1069: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1070: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1071: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1072: cusparseIndexBase_t indexBase;
1073: cusparseMatrixType_t matrixType;
1074: cusparseFillMode_t fillMode;
1075: cusparseDiagType_t diagType;
1077: PetscFunctionBegin;
1078: /* allocate space for the transpose of the lower triangular factor */
1079: PetscCall(PetscNew(&loTriFactorT));
1080: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1082: /* set the matrix descriptors of the lower triangular factor */
1083: matrixType = cusparseGetMatType(loTriFactor->descr);
1084: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1085: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1086: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1088: /* Create the matrix description */
1089: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1090: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1091: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1092: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1093: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1095: /* set the operation */
1096: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1098: /* allocate GPU space for the CSC of the lower triangular factor */
1099: loTriFactorT->csrMat = new CsrMatrix;
1100: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1101: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1102: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1103: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1104: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1105: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1107: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1108: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1109: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1110: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1111: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1112: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1113: #endif
1115: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1116: {
1117: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1118: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1119: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1120: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1121: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1122: #else
1123: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1124: #endif
1125: PetscCallCUSPARSE(stat);
1126: }
1128: PetscCallCUDA(WaitForCUDA());
1129: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1131: /* Create the solve analysis information */
1132: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1133: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1134: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1135: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1136: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1137: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1138: #endif
1140: /* perform the solve analysis */
1141: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1142: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1144: PetscCallCUDA(WaitForCUDA());
1145: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1147: /* assign the pointer */
1148: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1150: /*********************************************/
1151: /* Now the Transpose of the Upper Tri Factor */
1152: /*********************************************/
1154: /* allocate space for the transpose of the upper triangular factor */
1155: PetscCall(PetscNew(&upTriFactorT));
1156: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1158: /* set the matrix descriptors of the upper triangular factor */
1159: matrixType = cusparseGetMatType(upTriFactor->descr);
1160: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1161: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1162: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1164: /* Create the matrix description */
1165: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1166: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1167: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1168: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1169: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1171: /* set the operation */
1172: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1174: /* allocate GPU space for the CSC of the upper triangular factor*/
1175: upTriFactorT->csrMat = new CsrMatrix;
1176: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1177: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1178: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1179: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1180: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1181: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1183: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1184: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1185: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1186: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1187: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1188: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1189: #endif
1191: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1192: {
1193: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
1194: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1195: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1196: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1197: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1198: #else
1199: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1200: #endif
1201: PetscCallCUSPARSE(stat);
1202: }
1204: PetscCallCUDA(WaitForCUDA());
1205: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1207: /* Create the solve analysis information */
1208: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1209: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1210: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1211: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1212: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1213: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1214: #endif
1216: /* perform the solve analysis */
1217: /* TODO: the transpose setup for the lower and upper factors duplicates the same sequence of calls and should be refactored into a shared helper */
1218: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1219: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1221: PetscCallCUDA(WaitForCUDA());
1222: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1224: /* assign the pointer */
1225: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1226: PetscFunctionReturn(PETSC_SUCCESS);
1227: }
1228: #endif
1230: struct PetscScalarToPetscInt {
1231: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1232: };
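/* PetscScalarToPetscInt converts scalars back to integer indices. MatSeqAIJCUSPARSEFormExplicitTranspose()
   uses it to recover the CSR-to-CSC permutation: the sequence 0,1,...,nnz-1 is fed through csr2csc as if it
   were the matrix values, and the permuted result is converted back to integers (csr2csc_i). Subsequent
   transpose updates then only need a gather of the values with this permutation instead of another csr2csc call. */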
1234: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1235: {
1236: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1237: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1238: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1239: cusparseStatus_t stat;
1240: cusparseIndexBase_t indexBase;
1242: PetscFunctionBegin;
1243: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1244: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1245: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1246: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1247: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1248: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1249: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1250: PetscCall(PetscLogGpuTimeBegin());
1251: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1252: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1253: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1254: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1255: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1256: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1257: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1259: /* set alpha and beta */
1260: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1261: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1262: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1263: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1264: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1265: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1267: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1268: CsrMatrix *matrixT = new CsrMatrix;
1269: matstructT->mat = matrixT;
1270: matrixT->num_rows = A->cmap->n;
1271: matrixT->num_cols = A->rmap->n;
1272: matrixT->num_entries = a->nz;
1273: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1274: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1275: matrixT->values = new THRUSTARRAY(a->nz);
1277: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1278: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1280: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1281: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1282: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1283: indexBase, cusparse_scalartype);
1284: PetscCallCUSPARSE(stat);
1285: #else
1286: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1287: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1289: It is unclear what a proper value for matstructT->matDescr should be with empty matrices, so it is set
1290: to NULL so that anything relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1291: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so it is also set accordingly.
1292: */
1293: if (matrixT->num_entries) {
1294: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1295: PetscCallCUSPARSE(stat);
1297: } else {
1298: matstructT->matDescr = NULL;
1299: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1300: }
1301: #endif
1302: #endif
1303: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1304: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1305: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1306: #else
1307: CsrMatrix *temp = new CsrMatrix;
1308: CsrMatrix *tempT = new CsrMatrix;
1309: /* First convert HYB to CSR */
1310: temp->num_rows = A->rmap->n;
1311: temp->num_cols = A->cmap->n;
1312: temp->num_entries = a->nz;
1313: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1314: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1315: temp->values = new THRUSTARRAY(a->nz);
1317: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1318: PetscCallCUSPARSE(stat);
1320: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1321: tempT->num_rows = A->rmap->n;
1322: tempT->num_cols = A->cmap->n;
1323: tempT->num_entries = a->nz;
1324: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1325: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1326: tempT->values = new THRUSTARRAY(a->nz);
1328: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1329: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1330: PetscCallCUSPARSE(stat);
1332: /* Last, convert CSC to HYB */
1333: cusparseHybMat_t hybMat;
1334: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1335: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1336: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1337: PetscCallCUSPARSE(stat);
1339: /* assign the pointer */
1340: matstructT->mat = hybMat;
1341: A->transupdated = PETSC_TRUE;
1342: /* delete temporaries */
1343: if (tempT) {
1344: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1345: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1346: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1347: delete (CsrMatrix *)tempT;
1348: }
1349: if (temp) {
1350: if (temp->values) delete (THRUSTARRAY *)temp->values;
1351: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1352: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1353: delete (CsrMatrix *)temp;
1354: }
1355: #endif
1356: }
1357: }
1358: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1359: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1360: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1361: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1362: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1363: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1364: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1365: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1366: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1367: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1368: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1369: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1370: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1371: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1372: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1373: }
1374: if (!cusparsestruct->csr2csc_i) {
1375: THRUSTARRAY csr2csc_a(matrix->num_entries);
1376: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1378: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1379: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1380: void *csr2cscBuffer;
1381: size_t csr2cscBufferSize;
1382: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1383: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1384: PetscCallCUSPARSE(stat);
1385: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1386: #endif
1388: if (matrix->num_entries) {
1389: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1390: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1391: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1393: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1394: should be filled with indexBase. So I just take a shortcut here.
1395: */
1396: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1397: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1398: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1399: PetscCallCUSPARSE(stat);
1400: #else
1401: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1402: PetscCallCUSPARSE(stat);
1403: #endif
1404: } else {
1405: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1406: }
1408: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1409: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1410: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1411: PetscCallCUDA(cudaFree(csr2cscBuffer));
1412: #endif
1413: }
1414: PetscCallThrust(
1415: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1416: }
1417: PetscCall(PetscLogGpuTimeEnd());
1418: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1419: /* the compressed row indices are not used for matTranspose */
1420: matstructT->cprowIndices = NULL;
1421: /* assign the pointer */
1422: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1423: A->transupdated = PETSC_TRUE;
1424: PetscFunctionReturn(PETSC_SUCCESS);
1425: }
1427: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
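/* Solve A x = b with the LU factors held in the Mat_SeqAIJCUSPARSETriFactors struct: apply the row
   permutation to b (if any), solve L y = b', solve U x' = y, then apply the column permutation to obtain x.
   Both triangular solves use cusparseSpSV with the analysis data set up during factorization. */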
1428: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1429: {
1430: const PetscScalar *barray;
1431: PetscScalar *xarray;
1432: thrust::device_ptr<const PetscScalar> bGPU;
1433: thrust::device_ptr<PetscScalar> xGPU;
1434: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1435: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1436: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1437: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1438: PetscInt m = A->rmap->n;
1440: PetscFunctionBegin;
1441: PetscCall(PetscLogGpuTimeBegin());
1442: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1443: PetscCall(VecCUDAGetArrayRead(b, &barray));
1444: xGPU = thrust::device_pointer_cast(xarray);
1445: bGPU = thrust::device_pointer_cast(barray);
1447: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1448: if (fs->rpermIndices) {
1449: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1450: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1451: } else {
1452: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1453: }
1455: // Solve L Y = X
1456: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1457: // Note that cusparseSpSV_solve() implicitly reuses the external buffer that was passed to cusparseSpSV_analysis()!
1458: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1460: // Solve U X = Y
1461: if (fs->cpermIndices) {
1462: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1463: } else {
1464: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1465: }
1466: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1468: // Reorder X with the column permutation if needed, and put the result back to x
1469: if (fs->cpermIndices) {
1470: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1471: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1472: }
1473: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1474: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1475: PetscCall(PetscLogGpuTimeEnd());
1476: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1477: PetscFunctionReturn(PETSC_SUCCESS);
1478: }
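/* Transpose solve with the same LU factors: the SpSV descriptors for the transposed solves with L and U are
   created and analyzed lazily on the first call, then the solve proceeds as U^T y = b' followed by L^T x' = y,
   with the same row/column permutation handling as in MatSolve_SeqAIJCUSPARSE_LU(). */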
1480: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1481: {
1482: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1483: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1484: const PetscScalar *barray;
1485: PetscScalar *xarray;
1486: thrust::device_ptr<const PetscScalar> bGPU;
1487: thrust::device_ptr<PetscScalar> xGPU;
1488: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1489: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1490: PetscInt m = A->rmap->n;
1492: PetscFunctionBegin;
1493: PetscCall(PetscLogGpuTimeBegin());
1494: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1495: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1496: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1497: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1499: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1500: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1501: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1502: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1503: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1504: }
1506: if (!fs->updatedTransposeSpSVAnalysis) {
1507: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1509: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1510: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1511: }
1513: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1514: PetscCall(VecCUDAGetArrayRead(b, &barray));
1515: xGPU = thrust::device_pointer_cast(xarray);
1516: bGPU = thrust::device_pointer_cast(barray);
1518: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1519: if (fs->rpermIndices) {
1520: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1521: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1522: } else {
1523: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1524: }
1526: // Solve Ut Y = X
1527: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1528: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1530: // Solve Lt X = Y
1531: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1532: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1533: } else {
1534: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1535: }
1536: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1538: // Reorder X with the column permutation if needed, and put the result back to x
1539: if (fs->cpermIndices) {
1540: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1541: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1542: }
1544: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1545: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1546: PetscCall(PetscLogGpuTimeEnd());
1547: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1548: PetscFunctionReturn(PETSC_SUCCESS);
1549: }
1550: #else
1551: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1552: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1553: {
1554: PetscInt n = xx->map->n;
1555: const PetscScalar *barray;
1556: PetscScalar *xarray;
1557: thrust::device_ptr<const PetscScalar> bGPU;
1558: thrust::device_ptr<PetscScalar> xGPU;
1559: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1560: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1561: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1562: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1564: PetscFunctionBegin;
1565: /* Analyze the matrix and create the transpose ... on the fly */
1566: if (!loTriFactorT && !upTriFactorT) {
1567: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1568: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1569: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1570: }
1572: /* Get the GPU pointers */
1573: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1574: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1575: xGPU = thrust::device_pointer_cast(xarray);
1576: bGPU = thrust::device_pointer_cast(barray);
1578: PetscCall(PetscLogGpuTimeBegin());
1579: /* First, reorder with the row permutation */
1580: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1582: /* First, solve U */
1583: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1584: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1586: /* Then, solve L */
1587: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1588: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1590: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1591: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1593: /* Copy the temporary to the full solution. */
1594: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1596: /* restore */
1597: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1598: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1599: PetscCall(PetscLogGpuTimeEnd());
1600: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1601: PetscFunctionReturn(PETSC_SUCCESS);
1602: }
1604: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1605: {
1606: const PetscScalar *barray;
1607: PetscScalar *xarray;
1608: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1609: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1610: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1611: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1613: PetscFunctionBegin;
1614: /* Analyze the matrix and create the transpose ... on the fly */
1615: if (!loTriFactorT && !upTriFactorT) {
1616: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1617: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1618: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1619: }
1621: /* Get the GPU pointers */
1622: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1623: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1625: PetscCall(PetscLogGpuTimeBegin());
1626: /* First, solve U */
1627: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1628: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1630: /* Then, solve L */
1631: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1632: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1634: /* restore */
1635: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1636: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1637: PetscCall(PetscLogGpuTimeEnd());
1638: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1639: PetscFunctionReturn(PETSC_SUCCESS);
1640: }
1642: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1643: {
1644: const PetscScalar *barray;
1645: PetscScalar *xarray;
1646: thrust::device_ptr<const PetscScalar> bGPU;
1647: thrust::device_ptr<PetscScalar> xGPU;
1648: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1649: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1650: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1651: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1653: PetscFunctionBegin;
1654: /* Get the GPU pointers */
1655: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1656: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1657: xGPU = thrust::device_pointer_cast(xarray);
1658: bGPU = thrust::device_pointer_cast(barray);
1660: PetscCall(PetscLogGpuTimeBegin());
1661: /* First, reorder with the row permutation */
1662: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1664: /* Next, solve L */
1665: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1666: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1668: /* Then, solve U */
1669: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1670: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1672: /* Last, reorder with the column permutation */
1673: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1675: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1676: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1677: PetscCall(PetscLogGpuTimeEnd());
1678: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1679: PetscFunctionReturn(PETSC_SUCCESS);
1680: }
1682: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1683: {
1684: const PetscScalar *barray;
1685: PetscScalar *xarray;
1686: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1687: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1688: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1689: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1691: PetscFunctionBegin;
1692: /* Get the GPU pointers */
1693: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1694: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1696: PetscCall(PetscLogGpuTimeBegin());
1697: /* First, solve L */
1698: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1699: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1701: /* Next, solve U */
1702: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1703: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1705: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1706: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1707: PetscCall(PetscLogGpuTimeEnd());
1708: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1709: PetscFunctionReturn(PETSC_SUCCESS);
1710: }
1711: #endif
1713: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
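/* Numeric ILU(0): copy A's values into the factor (which shares A's sparsity pattern), run cusparseXcsrilu02()
   in place on the device, then refresh the SpSV solve data for L and U (via cusparseSpSV_updateMatrix() on
   new-enough CUDA, otherwise by redoing cusparseSpSV_analysis()). */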
1714: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1715: {
1716: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1717: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1718: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1719: CsrMatrix *Acsr;
1720: PetscInt m, nz;
1721: PetscBool flg;
1723: PetscFunctionBegin;
1724: if (PetscDefined(USE_DEBUG)) {
1725: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1726: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1727: }
1729: /* Copy A's values to fact */
1730: m = fact->rmap->n;
1731: nz = aij->nz;
1732: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1733: Acsr = (CsrMatrix *)Acusp->mat->mat;
1734: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1736: PetscCall(PetscLogGpuTimeBegin());
1737: /* Factorize fact inplace */
1738: if (m)
1739: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1740: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1741: if (PetscDefined(USE_DEBUG)) {
1742: int numerical_zero;
1743: cusparseStatus_t status;
1744: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1745: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1746: }
1748: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1749: if (fs->updatedSpSVAnalysis) {
1750: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1751: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1752: } else
1753: #endif
1754: {
1755: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1756: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1757: */
1758: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1760: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1762: fs->updatedSpSVAnalysis = PETSC_TRUE;
1763: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1764: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1765: }
1767: fact->offloadmask = PETSC_OFFLOAD_GPU;
1768: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1769: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1770: fact->ops->matsolve = NULL;
1771: fact->ops->matsolvetranspose = NULL;
1772: PetscCall(PetscLogGpuTimeEnd());
1773: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1774: PetscFunctionReturn(PETSC_SUCCESS);
1775: }
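/* Symbolic ILU(0): since ILU(0) introduces no fill, the factor reuses A's row pointers and column indices.
   This routine only allocates device storage, creates the cusparse descriptors for M, L and U, queries and
   allocates the csrilu02/SpSV buffers, runs the structural analysis, and estimates the numeric-factorization flops. */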
1777: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1778: {
1779: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1780: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1781: PetscInt m, nz;
1783: PetscFunctionBegin;
1784: if (PetscDefined(USE_DEBUG)) {
1785: PetscInt i;
1786: PetscBool flg, missing;
1788: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1789: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1790: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1791: PetscCall(MatMissingDiagonal(A, &missing, &i));
1792: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1793: }
1795: /* Free the old stale stuff */
1796: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1798: /* Copy over A's metadata to fact. Note that fact's i, j, a are also allocated on the host,
1799: but they will not be used; they exist only to ease debugging.
1800: */
1801: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1803: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1804: fact->factortype = MAT_FACTOR_ILU;
1805: fact->info.factor_mallocs = 0;
1806: fact->info.fill_ratio_given = info->fill;
1807: fact->info.fill_ratio_needed = 1.0;
1809: aij->row = NULL;
1810: aij->col = NULL;
1812: /* ====================================================================== */
1813: /* Copy A's i, j to fact and also allocate the value array of fact. */
1814: /* We'll do in-place factorization on fact */
1815: /* ====================================================================== */
1816: const int *Ai, *Aj;
1818: m = fact->rmap->n;
1819: nz = aij->nz;
1821: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1822: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1823: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1824: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1825: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1826: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1828: /* ====================================================================== */
1829: /* Create descriptors for M, L, U */
1830: /* ====================================================================== */
1831: cusparseFillMode_t fillMode;
1832: cusparseDiagType_t diagType;
1834: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1835: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1836: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1838: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1839: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1840: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1841: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1842: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1843: */
1844: fillMode = CUSPARSE_FILL_MODE_LOWER;
1845: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1846: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1847: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1848: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1850: fillMode = CUSPARSE_FILL_MODE_UPPER;
1851: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1852: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1853: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1854: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1856: /* ========================================================================= */
1857: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1858: /* ========================================================================= */
1859: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1860: if (m)
1861: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1862: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1864: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1865: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1867: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1868: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1870: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1871: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1873: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1874: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1876: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1877: and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1878: spsvBuffer_L and spsvBuffer_U cannot be the same buffer in our case, but factBuffer_M can be shared with either of them.
1879: To save memory, we make factBuffer_M share storage with the larger of spsvBuffer_L/U.
1880: */
1881: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1882: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1883: fs->spsvBuffer_L = fs->factBuffer_M;
1884: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1885: } else {
1886: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1887: fs->spsvBuffer_U = fs->factBuffer_M;
1888: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1889: }
1891: /* ========================================================================== */
1892: /* Perform analysis of ilu0 on M, SpSv on L and U */
1893: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U) */
1894: /* ========================================================================== */
1895: int structural_zero;
1896: cusparseStatus_t status;
1898: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1899: if (m)
1900: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1901: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1902: if (PetscDefined(USE_DEBUG)) {
1903: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1904: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1905: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1906: }
1908: /* Estimate FLOPs of the numeric factorization */
1909: {
1910: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1911: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1912: PetscLogDouble flops = 0.0;
1914: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1915: Ai = Aseq->i;
1916: Adiag = Aseq->diag;
1917: for (PetscInt i = 0; i < m; i++) {
1918: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros to the left of the diagonal in row i */
1919: nzRow = Ai[i + 1] - Ai[i];
1920: nzLeft = Adiag[i] - Ai[i];
1921: /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates
1922: the nonzeros to the right of (and including) the eliminated one, which incurs one multiplication and one addition per updated entry.
1923: */
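/* For example, a row with nzRow = 5 nonzeros gives nzLeft = (5 - 1) / 2 = 2 below, contributing
   2 * (2 * 5 - 2 + 1) = 18 flops to the estimate. */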
1924: nzLeft = (nzRow - 1) / 2;
1925: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1926: }
1927: }
1928: fs->numericFactFlops = flops;
1929: }
1930: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1931: PetscFunctionReturn(PETSC_SUCCESS);
1932: }
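/* Solve with the IC(0) factor: only L is stored, so the solve is L y = b followed by L^T x = y. The second
   solve reuses spMatDescr_L with CUSPARSE_OPERATION_TRANSPOSE and its own SpSV descriptor (spsvDescr_Lt). */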
1934: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1935: {
1936: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1937: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1938: const PetscScalar *barray;
1939: PetscScalar *xarray;
1941: PetscFunctionBegin;
1942: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1943: PetscCall(VecCUDAGetArrayRead(b, &barray));
1944: PetscCall(PetscLogGpuTimeBegin());
1946: /* Solve L*y = b */
1947: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1948: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1949: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1950: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1952: /* Solve Lt*x = y */
1953: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1954: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1955: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1957: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1958: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1960: PetscCall(PetscLogGpuTimeEnd());
1961: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1962: PetscFunctionReturn(PETSC_SUCCESS);
1963: }
1965: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1966: {
1967: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1968: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1969: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1970: CsrMatrix *Acsr;
1971: PetscInt m, nz;
1972: PetscBool flg;
1974: PetscFunctionBegin;
1975: if (PetscDefined(USE_DEBUG)) {
1976: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1977: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1978: }
1980: /* Copy A's values to fact */
1981: m = fact->rmap->n;
1982: nz = aij->nz;
1983: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1984: Acsr = (CsrMatrix *)Acusp->mat->mat;
1985: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1987: /* Factorize fact inplace */
1988: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1989: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1990: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1991: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1992: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1993: */
1994: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1995: if (PetscDefined(USE_DEBUG)) {
1996: int numerical_zero;
1997: cusparseStatus_t status;
1998: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1999: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2000: }
2002: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2003: if (fs->updatedSpSVAnalysis) {
2004: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2005: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2006: } else
2007: #endif
2008: {
2009: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
2011: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2012: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2013: */
2014: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2015: fs->updatedSpSVAnalysis = PETSC_TRUE;
2016: }
2018: fact->offloadmask = PETSC_OFFLOAD_GPU;
2019: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
2020: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
2021: fact->ops->matsolve = NULL;
2022: fact->ops->matsolvetranspose = NULL;
2023: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2024: PetscFunctionReturn(PETSC_SUCCESS);
2025: }
2027: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2028: {
2029: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2030: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2031: PetscInt m, nz;
2033: PetscFunctionBegin;
2034: if (PetscDefined(USE_DEBUG)) {
2035: PetscInt i;
2036: PetscBool flg, missing;
2038: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2039: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2040: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2041: PetscCall(MatMissingDiagonal(A, &missing, &i));
2042: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2043: }
2045: /* Free the old stale stuff */
2046: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2048: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
2049: but they will not be used; we allocate them just to ease debugging.
2050: */
2051: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2053: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2054: fact->factortype = MAT_FACTOR_ICC;
2055: fact->info.factor_mallocs = 0;
2056: fact->info.fill_ratio_given = info->fill;
2057: fact->info.fill_ratio_needed = 1.0;
2059: aij->row = NULL;
2060: aij->col = NULL;
2062: /* ====================================================================== */
2063: /* Copy A's i, j to fact and also allocate the value array of fact. */
2064: /* We'll do in-place factorization on fact */
2065: /* ====================================================================== */
2066: const int *Ai, *Aj;
2068: m = fact->rmap->n;
2069: nz = aij->nz;
2071: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2072: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2073: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2074: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2075: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2076: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2078: /* ====================================================================== */
2079: /* Create mat descriptors for M, L */
2080: /* ====================================================================== */
2081: cusparseFillMode_t fillMode;
2082: cusparseDiagType_t diagType;
2084: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2085: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2086: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2088: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2089: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2090: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2091: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2092: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2093: */
2094: fillMode = CUSPARSE_FILL_MODE_LOWER;
2095: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2096: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2097: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2098: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2100: /* ========================================================================= */
2101: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2102: /* ========================================================================= */
2103: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2104: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2106: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2107: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2109: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2110: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2112: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2113: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2115: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2116: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2118: /* To save device memory, we let the factorization buffer share storage with the larger of the two solve buffers.
2119: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2120: */
2121: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2122: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2123: fs->spsvBuffer_L = fs->factBuffer_M;
2124: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2125: } else {
2126: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2127: fs->spsvBuffer_Lt = fs->factBuffer_M;
2128: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2129: }
2131: /* ========================================================================== */
2132: /* Perform analysis of ic0 on M */
2133: /* The lower triangular part of M has the same sparsity pattern as L */
2134: /* ========================================================================== */
2135: int structural_zero;
2136: cusparseStatus_t status;
2138: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2139: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2140: if (PetscDefined(USE_DEBUG)) {
2141: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2142: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2143: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2144: }
2146: /* Estimate FLOPs of the numeric factorization */
2147: {
2148: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2149: PetscInt *Ai, nzRow, nzLeft;
2150: PetscLogDouble flops = 0.0;
2152: Ai = Aseq->i;
2153: for (PetscInt i = 0; i < m; i++) {
2154: nzRow = Ai[i + 1] - Ai[i];
2155: if (nzRow > 1) {
2156: /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination
2157: updates the nonzeros from the eliminated entry rightward (inclusive), each update incurring a multiplication and an addition.
2158: */
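/* Under that assumption, eliminating the k-th of the nzLeft sub-diagonal nonzeros updates (nzRow - k + 1)
   entries at 2 flops each, so one row costs sum_{k=1..nzLeft} 2*(nzRow - k + 1) = nzLeft*(2*nzRow - nzLeft + 1)
   flops, which is the formula used below. */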
2159: nzLeft = (nzRow - 1) / 2;
2160: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2161: }
2162: }
2163: fs->numericFactFlops = flops;
2164: }
2165: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2166: PetscFunctionReturn(PETSC_SUCCESS);
2167: }
2168: #endif
2170: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2171: {
2172: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2173: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2175: PetscFunctionBegin;
2176: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2177: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2178: B->offloadmask = PETSC_OFFLOAD_CPU;
2180: if (!cusparsestruct->use_cpu_solve) {
2181: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2182: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2183: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2184: #else
2185: /* determine which version of MatSolve needs to be used. */
2186: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2187: IS isrow = b->row, iscol = b->col;
2188: PetscBool row_identity, col_identity;
2190: PetscCall(ISIdentity(isrow, &row_identity));
2191: PetscCall(ISIdentity(iscol, &col_identity));
2192: if (row_identity && col_identity) {
2193: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2194: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2195: } else {
2196: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2197: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2198: }
2199: #endif
2200: }
2201: B->ops->matsolve = NULL;
2202: B->ops->matsolvetranspose = NULL;
2204: /* get the triangular factors */
2205: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2206: PetscFunctionReturn(PETSC_SUCCESS);
2207: }
2209: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2210: {
2211: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2213: PetscFunctionBegin;
2214: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2215: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2216: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2217: PetscFunctionReturn(PETSC_SUCCESS);
2218: }
2220: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2221: {
2222: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2224: PetscFunctionBegin;
2225: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2226: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2227: if (!info->factoronhost) {
2228: PetscCall(ISIdentity(isrow, &row_identity));
2229: PetscCall(ISIdentity(iscol, &col_identity));
2230: }
2231: if (!info->levels && row_identity && col_identity) {
2232: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2233: } else
2234: #endif
2235: {
2236: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2237: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2238: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2239: }
2240: PetscFunctionReturn(PETSC_SUCCESS);
2241: }
2243: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2244: {
2245: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2247: PetscFunctionBegin;
2248: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2249: PetscBool perm_identity = PETSC_FALSE;
2250: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2251: if (!info->levels && perm_identity) {
2252: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2253: } else
2254: #endif
2255: {
2256: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2257: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2258: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2259: }
2260: PetscFunctionReturn(PETSC_SUCCESS);
2261: }
2263: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2264: {
2265: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2267: PetscFunctionBegin;
2268: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2269: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2270: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2271: PetscFunctionReturn(PETSC_SUCCESS);
2272: }
2274: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2275: {
2276: PetscFunctionBegin;
2277: *type = MATSOLVERCUSPARSE;
2278: PetscFunctionReturn(PETSC_SUCCESS);
2279: }
2281: /*MC
2282: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
2283: of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2284: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2285: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2286: cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2287: algorithms are not recommended. This class does NOT support direct solver operations.
2289: Level: beginner
2291: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2292: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2293: M*/
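/* A minimal usage sketch (illustrative, not part of this file): this solver is typically selected from the
   options database, e.g.
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   or programmatically, assuming pc is the preconditioner of an already configured KSP:
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
*/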
2295: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2296: {
2297: PetscInt n = A->rmap->n;
2299: PetscFunctionBegin;
2300: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2301: PetscCall(MatSetSizes(*B, n, n, n, n));
2302: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2303: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2305: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2306: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2307: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2308: if (!A->boundtocpu) {
2309: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2310: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2311: } else {
2312: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2313: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2314: }
2315: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2316: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2317: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2318: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2319: if (!A->boundtocpu) {
2320: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2321: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2322: } else {
2323: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2324: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2325: }
2326: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2327: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2328: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2330: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2331: (*B)->canuseordering = PETSC_TRUE;
2332: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2333: PetscFunctionReturn(PETSC_SUCCESS);
2334: }
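/* A minimal usage sketch (illustrative): the factory above is normally reached through the generic interface,
   e.g.
     Mat F;
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_ILU, &F));
   after which the symbolic/numeric routines installed above are driven by the PC/KSP machinery. */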
2336: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2337: {
2338: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2339: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2340: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2341: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2342: #endif
2344: PetscFunctionBegin;
2345: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2346: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2347: if (A->factortype == MAT_FACTOR_NONE) {
2348: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2349: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2350: }
2351: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2352: else if (fs->csrVal) {
2353: /* We have a factorized matrix on device and are able to copy it to host */
2354: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2355: }
2356: #endif
2357: else
2358: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2359: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2360: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2361: A->offloadmask = PETSC_OFFLOAD_BOTH;
2362: }
2363: PetscFunctionReturn(PETSC_SUCCESS);
2364: }
2366: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2367: {
2368: PetscFunctionBegin;
2369: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2370: *array = ((Mat_SeqAIJ *)A->data)->a;
2371: PetscFunctionReturn(PETSC_SUCCESS);
2372: }
2374: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2375: {
2376: PetscFunctionBegin;
2377: A->offloadmask = PETSC_OFFLOAD_CPU;
2378: *array = NULL;
2379: PetscFunctionReturn(PETSC_SUCCESS);
2380: }
2382: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2383: {
2384: PetscFunctionBegin;
2385: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2386: *array = ((Mat_SeqAIJ *)A->data)->a;
2387: PetscFunctionReturn(PETSC_SUCCESS);
2388: }
2390: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2391: {
2392: PetscFunctionBegin;
2393: *array = NULL;
2394: PetscFunctionReturn(PETSC_SUCCESS);
2395: }
2397: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2398: {
2399: PetscFunctionBegin;
2400: *array = ((Mat_SeqAIJ *)A->data)->a;
2401: PetscFunctionReturn(PETSC_SUCCESS);
2402: }
2404: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2405: {
2406: PetscFunctionBegin;
2407: A->offloadmask = PETSC_OFFLOAD_CPU;
2408: *array = NULL;
2409: PetscFunctionReturn(PETSC_SUCCESS);
2410: }
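/* A minimal usage sketch (illustrative) of the Get/Restore pair registered above, through the public API:
     PetscScalar *vals;
     PetscCall(MatSeqAIJGetArray(A, &vals));     // copies values device-to-host if they only live on the GPU
     vals[0] *= 2.0;                             // modify on the host
     PetscCall(MatSeqAIJRestoreArray(A, &vals)); // marks the host copy as authoritative (offloadmask = CPU)
*/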
2412: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2413: {
2414: Mat_SeqAIJCUSPARSE *cusp;
2415: CsrMatrix *matrix;
2417: PetscFunctionBegin;
2418: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2419: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2420: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2421: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2422: matrix = (CsrMatrix *)cusp->mat->mat;
2424: if (i) {
2425: #if !defined(PETSC_USE_64BIT_INDICES)
2426: *i = matrix->row_offsets->data().get();
2427: #else
2428: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2429: #endif
2430: }
2431: if (j) {
2432: #if !defined(PETSC_USE_64BIT_INDICES)
2433: *j = matrix->column_indices->data().get();
2434: #else
2435: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2436: #endif
2437: }
2438: if (a) *a = matrix->values->data().get();
2439: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2440: PetscFunctionReturn(PETSC_SUCCESS);
2441: }
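/* A minimal usage sketch (illustrative): retrieving the device CSR arrays of a MATSEQAIJCUSPARSE matrix
     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype)); // here mtype reports PETSC_MEMTYPE_CUDA
*/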
2443: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2444: {
2445: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2446: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2447: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2448: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2449: cusparseStatus_t stat;
2450: PetscBool both = PETSC_TRUE;
2452: PetscFunctionBegin;
2453: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2454: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2455: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2456: CsrMatrix *matrix;
2457: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2459: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2460: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2461: matrix->values->assign(a->a, a->a + a->nz);
2462: PetscCallCUDA(WaitForCUDA());
2463: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2464: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2465: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2466: } else {
2467: PetscInt nnz;
2468: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2469: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2470: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2471: delete cusparsestruct->workVector;
2472: delete cusparsestruct->rowoffsets_gpu;
2473: cusparsestruct->workVector = NULL;
2474: cusparsestruct->rowoffsets_gpu = NULL;
2475: try {
2476: if (a->compressedrow.use) {
2477: m = a->compressedrow.nrows;
2478: ii = a->compressedrow.i;
2479: ridx = a->compressedrow.rindex;
2480: } else {
2481: m = A->rmap->n;
2482: ii = a->i;
2483: ridx = NULL;
2484: }
2485: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2486: if (!a->a) {
2487: nnz = ii[m];
2488: both = PETSC_FALSE;
2489: } else nnz = a->nz;
2490: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2492: /* create cusparse matrix */
2493: cusparsestruct->nrows = m;
2494: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2495: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2496: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2497: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2499: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2500: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2501: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2502: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2503: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2504: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2507: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2508: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2509: /* set the matrix */
2510: CsrMatrix *mat = new CsrMatrix;
2511: mat->num_rows = m;
2512: mat->num_cols = A->cmap->n;
2513: mat->num_entries = nnz;
2514: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2515: mat->row_offsets->assign(ii, ii + m + 1);
2517: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2518: mat->column_indices->assign(a->j, a->j + nnz);
2520: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2521: if (a->a) mat->values->assign(a->a, a->a + nnz);
2523: /* assign the pointer */
2524: matstruct->mat = mat;
2525: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2526: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2527: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2528: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2529: PetscCallCUSPARSE(stat);
2530: }
2531: #endif
2532: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2533: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2534: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2535: #else
2536: CsrMatrix *mat = new CsrMatrix;
2537: mat->num_rows = m;
2538: mat->num_cols = A->cmap->n;
2539: mat->num_entries = nnz;
2540: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2541: mat->row_offsets->assign(ii, ii + m + 1);
2543: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2544: mat->column_indices->assign(a->j, a->j + nnz);
2546: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2547: if (a->a) mat->values->assign(a->a, a->a + nnz);
2549: cusparseHybMat_t hybMat;
2550: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2551: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2552: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2553: PetscCallCUSPARSE(stat);
2554: /* assign the pointer */
2555: matstruct->mat = hybMat;
2557: if (mat) {
2558: if (mat->values) delete (THRUSTARRAY *)mat->values;
2559: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2560: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2561: delete (CsrMatrix *)mat;
2562: }
2563: #endif
2564: }
2566: /* assign the compressed row indices */
2567: if (a->compressedrow.use) {
2568: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2569: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2570: matstruct->cprowIndices->assign(ridx, ridx + m);
2571: tmp = m;
2572: } else {
2573: cusparsestruct->workVector = NULL;
2574: matstruct->cprowIndices = NULL;
2575: tmp = 0;
2576: }
2577: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2579: /* assign the pointer */
2580: cusparsestruct->mat = matstruct;
2581: } catch (char *ex) {
2582: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2583: }
2584: PetscCallCUDA(WaitForCUDA());
2585: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2586: cusparsestruct->nonzerostate = A->nonzerostate;
2587: }
2588: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2589: }
2590: PetscFunctionReturn(PETSC_SUCCESS);
2591: }
2593: struct VecCUDAPlusEquals {
2594: template <typename Tuple>
2595: __host__ __device__ void operator()(Tuple t)
2596: {
2597: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2598: }
2599: };
2601: struct VecCUDAEquals {
2602: template <typename Tuple>
2603: __host__ __device__ void operator()(Tuple t)
2604: {
2605: thrust::get<1>(t) = thrust::get<0>(t);
2606: }
2607: };
2609: struct VecCUDAEqualsReverse {
2610: template <typename Tuple>
2611: __host__ __device__ void operator()(Tuple t)
2612: {
2613: thrust::get<0>(t) = thrust::get<1>(t);
2614: }
2615: };
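/* A minimal usage sketch (illustrative, assuming two device vectors src and dst of equal length): these
   functors are applied elementwise through thrust over zipped iterators, e.g. to accumulate src into dst:
     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(src.begin(), dst.begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(src.end(), dst.end())),
                      VecCUDAPlusEquals());
*/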
2617: struct MatMatCusparse {
2618: PetscBool cisdense;
2619: PetscScalar *Bt;
2620: Mat X;
2621: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2622: PetscLogDouble flops;
2623: CsrMatrix *Bcsr;
2625: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2626: cusparseSpMatDescr_t matSpBDescr;
2627: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2628: cusparseDnMatDescr_t matBDescr;
2629: cusparseDnMatDescr_t matCDescr;
2630: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2631: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2632: void *dBuffer4;
2633: void *dBuffer5;
2634: #endif
2635: size_t mmBufferSize;
2636: void *mmBuffer;
2637: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2638: cusparseSpGEMMDescr_t spgemmDesc;
2639: #endif
2640: };
2642: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2643: {
2644: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2646: PetscFunctionBegin;
2647: PetscCallCUDA(cudaFree(mmdata->Bt));
2648: delete mmdata->Bcsr;
2649: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2650: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2651: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2652: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2653: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2654: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2655: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2656: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2657: #endif
2658: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2659: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2660: #endif
2661: PetscCall(MatDestroy(&mmdata->X));
2662: PetscCall(PetscFree(data));
2663: PetscFunctionReturn(PETSC_SUCCESS);
2664: }
2666: #include <../src/mat/impls/dense/seq/dense.h>
2668: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2669: {
2670: Mat_Product *product = C->product;
2671: Mat A, B;
2672: PetscInt m, n, blda, clda;
2673: PetscBool flg, biscuda;
2674: Mat_SeqAIJCUSPARSE *cusp;
2675: cusparseStatus_t stat;
2676: cusparseOperation_t opA;
2677: const PetscScalar *barray;
2678: PetscScalar *carray;
2679: MatMatCusparse *mmdata;
2680: Mat_SeqAIJCUSPARSEMultStruct *mat;
2681: CsrMatrix *csrmat;
2683: PetscFunctionBegin;
2684: MatCheckProduct(C, 1);
2685: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2686: mmdata = (MatMatCusparse *)product->data;
2687: A = product->A;
2688: B = product->B;
2689: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2690: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2691: /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
2692: Instead of silently accepting a wrong answer, we prefer to raise an error */
2693: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2694: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2695: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2696: switch (product->type) {
2697: case MATPRODUCT_AB:
2698: case MATPRODUCT_PtAP:
2699: mat = cusp->mat;
2700: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2701: m = A->rmap->n;
2702: n = B->cmap->n;
2703: break;
2704: case MATPRODUCT_AtB:
2705: if (!A->form_explicit_transpose) {
2706: mat = cusp->mat;
2707: opA = CUSPARSE_OPERATION_TRANSPOSE;
2708: } else {
2709: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2710: mat = cusp->matTranspose;
2711: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2712: }
2713: m = A->cmap->n;
2714: n = B->cmap->n;
2715: break;
2716: case MATPRODUCT_ABt:
2717: case MATPRODUCT_RARt:
2718: mat = cusp->mat;
2719: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2720: m = A->rmap->n;
2721: n = B->rmap->n;
2722: break;
2723: default:
2724: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2725: }
2726: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2727: csrmat = (CsrMatrix *)mat->mat;
2728: /* if the user passed a CPU matrix, copy the data to the GPU */
2729: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2730: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2731: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2733: PetscCall(MatDenseGetLDA(B, &blda));
2734: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2735: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2736: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2737: } else {
2738: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2739: PetscCall(MatDenseGetLDA(C, &clda));
2740: }
2742: PetscCall(PetscLogGpuTimeBegin());
2743: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2744: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2745: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2746: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2747: #else
2748: cusparseSpMatDescr_t &matADescr = mat->matDescr;
2749: #endif
2751: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2752: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2753: size_t mmBufferSize;
2754: if (mmdata->initialized && mmdata->Blda != blda) {
2755: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2756: mmdata->matBDescr = NULL;
2757: }
2758: if (!mmdata->matBDescr) {
2759: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2760: mmdata->Blda = blda;
2761: }
2763: if (mmdata->initialized && mmdata->Clda != clda) {
2764: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2765: mmdata->matCDescr = NULL;
2766: }
2767: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2768: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2769: mmdata->Clda = clda;
2770: }
2772: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2773: if (matADescr) {
2774: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found that matADescr could not be reused; it could be a cuSPARSE bug
2775: matADescr = NULL;
2776: }
2777: #endif
2779: if (!matADescr) {
2780: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2781: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2782: PetscCallCUSPARSE(stat);
2783: }
2785: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2787: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2788: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2789: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2790: mmdata->mmBufferSize = mmBufferSize;
2791: }
2793: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2794: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2795: #endif
2797: mmdata->initialized = PETSC_TRUE;
2798: } else {
2799: /* to be safe, always update pointers of the mats */
2800: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2801: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2802: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2803: }
2805: /* do cusparseSpMM, which supports transpose on B */
2806: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2807: #else
2808: PetscInt k;
2809: /* cusparseXcsrmm does not support transpose on B */
2810: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2811: cublasHandle_t cublasv2handle;
2812: cublasStatus_t cerr;
2814: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2815: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2816: PetscCallCUBLAS(cerr);
2817: blda = B->cmap->n;
2818: k = B->cmap->n;
2819: } else {
2820: k = B->rmap->n;
2821: }
2823: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2824: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2825: PetscCallCUSPARSE(stat);
2826: #endif
2827: PetscCall(PetscLogGpuTimeEnd());
2828: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2829: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2830: if (product->type == MATPRODUCT_RARt) {
2831: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2832: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2833: } else if (product->type == MATPRODUCT_PtAP) {
2834: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2835: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2836: } else {
2837: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2838: }
2839: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2840: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2841: PetscFunctionReturn(PETSC_SUCCESS);
2842: }
2844: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2845: {
2846: Mat_Product *product = C->product;
2847: Mat A, B;
2848: PetscInt m, n;
2849: PetscBool cisdense, flg;
2850: MatMatCusparse *mmdata;
2851: Mat_SeqAIJCUSPARSE *cusp;
2853: PetscFunctionBegin;
2854: MatCheckProduct(C, 1);
2855: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2856: A = product->A;
2857: B = product->B;
2858: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2859: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2860: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2861: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2862: switch (product->type) {
2863: case MATPRODUCT_AB:
2864: m = A->rmap->n;
2865: n = B->cmap->n;
2866: PetscCall(MatSetBlockSizesFromMats(C, A, B));
2867: break;
2868: case MATPRODUCT_AtB:
2869: m = A->cmap->n;
2870: n = B->cmap->n;
2871: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2872: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2873: break;
2874: case MATPRODUCT_ABt:
2875: m = A->rmap->n;
2876: n = B->rmap->n;
2877: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2878: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2879: break;
2880: case MATPRODUCT_PtAP:
2881: m = B->cmap->n;
2882: n = B->cmap->n;
2883: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2884: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2885: break;
2886: case MATPRODUCT_RARt:
2887: m = B->rmap->n;
2888: n = B->rmap->n;
2889: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2890: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2891: break;
2892: default:
2893: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2894: }
2895: PetscCall(MatSetSizes(C, m, n, m, n));
2896: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy back to the CPU */
2897: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2898: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2900: /* product data */
2901: PetscCall(PetscNew(&mmdata));
2902: mmdata->cisdense = cisdense;
2903: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2904: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2905: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2906: #endif
2907: /* for these products we need intermediate storage */
2908: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2909: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2910: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2911: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2912: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2913: } else {
2914: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2915: }
2916: }
2917: C->product->data = mmdata;
2918: C->product->destroy = MatDestroy_MatMatCusparse;
2920: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2921: PetscFunctionReturn(PETSC_SUCCESS);
2922: }
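/* A minimal usage sketch (illustrative): the symbolic/numeric pair above is normally driven through the
   generic product interface, e.g. for C = A*B with A of type MATSEQAIJCUSPARSE and B dense:
     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/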
2924: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2925: {
2926: Mat_Product *product = C->product;
2927: Mat A, B;
2928: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2929: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2930: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2931: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2932: PetscBool flg;
2933: cusparseStatus_t stat;
2934: MatProductType ptype;
2935: MatMatCusparse *mmdata;
2936: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2937: cusparseSpMatDescr_t BmatSpDescr;
2938: #endif
2939: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2941: PetscFunctionBegin;
2942: MatCheckProduct(C, 1);
2943: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2944: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2945: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2946: mmdata = (MatMatCusparse *)C->product->data;
2947: A = product->A;
2948: B = product->B;
2949: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2950: mmdata->reusesym = PETSC_FALSE;
2951: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2952: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2953: Cmat = Ccusp->mat;
2954: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2955: Ccsr = (CsrMatrix *)Cmat->mat;
2956: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957: goto finalize;
2958: }
2959: if (!c->nz) goto finalize;
2960: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2961: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2962: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2963: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2964: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2965: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2966: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2967: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2968: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2969: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2970: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2971: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2973: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2975: ptype = product->type;
2976: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2977: ptype = MATPRODUCT_AB;
2978: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2979: }
2980: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2981: ptype = MATPRODUCT_AB;
2982: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2983: }
2984: switch (ptype) {
2985: case MATPRODUCT_AB:
2986: Amat = Acusp->mat;
2987: Bmat = Bcusp->mat;
2988: break;
2989: case MATPRODUCT_AtB:
2990: Amat = Acusp->matTranspose;
2991: Bmat = Bcusp->mat;
2992: break;
2993: case MATPRODUCT_ABt:
2994: Amat = Acusp->mat;
2995: Bmat = Bcusp->matTranspose;
2996: break;
2997: default:
2998: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2999: }
3000: Cmat = Ccusp->mat;
3001: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3002: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3003: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3004: Acsr = (CsrMatrix *)Amat->mat;
3005: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3006: Ccsr = (CsrMatrix *)Cmat->mat;
3007: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3008: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3009: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3010: PetscCall(PetscLogGpuTimeBegin());
3011: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3012: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3013: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3014: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3015: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3016: PetscCallCUSPARSE(stat);
3017: #else
3018: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3019: PetscCallCUSPARSE(stat);
3020: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3021: PetscCallCUSPARSE(stat);
3022: #endif
3023: #else
3024: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3025: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3026: PetscCallCUSPARSE(stat);
3027: #endif
3028: PetscCall(PetscLogGpuFlops(mmdata->flops));
3029: PetscCallCUDA(WaitForCUDA());
3030: PetscCall(PetscLogGpuTimeEnd());
3031: C->offloadmask = PETSC_OFFLOAD_GPU;
3032: finalize:
3033: /* shorter version of MatAssemblyEnd_SeqAIJ */
3034: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3035: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3036: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3037: c->reallocs = 0;
3038: C->info.mallocs += 0;
3039: C->info.nz_unneeded = 0;
3040: C->assembled = C->was_assembled = PETSC_TRUE;
3041: C->num_ass++;
3042: PetscFunctionReturn(PETSC_SUCCESS);
3043: }
3045: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3046: {
3047: Mat_Product *product = C->product;
3048: Mat A, B;
3049: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3050: Mat_SeqAIJ *a, *b, *c;
3051: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3052: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3053: PetscInt i, j, m, n, k;
3054: PetscBool flg;
3055: cusparseStatus_t stat;
3056: MatProductType ptype;
3057: MatMatCusparse *mmdata;
3058: PetscLogDouble flops;
3059: PetscBool biscompressed, ciscompressed;
3060: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3061: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3062: cusparseSpMatDescr_t BmatSpDescr;
3063: #else
3064: int cnz;
3065: #endif
3066: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3068: PetscFunctionBegin;
3069: MatCheckProduct(C, 1);
3070: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3071: A = product->A;
3072: B = product->B;
3073: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3074: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3075: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3076: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3077: a = (Mat_SeqAIJ *)A->data;
3078: b = (Mat_SeqAIJ *)B->data;
3079: /* product data */
3080: PetscCall(PetscNew(&mmdata));
3081: C->product->data = mmdata;
3082: C->product->destroy = MatDestroy_MatMatCusparse;
3084: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3085: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3086: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3087: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3088: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3089: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3091: ptype = product->type;
3092: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3093: ptype = MATPRODUCT_AB;
3094: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3095: }
3096: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3097: ptype = MATPRODUCT_AB;
3098: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3099: }
3100: biscompressed = PETSC_FALSE;
3101: ciscompressed = PETSC_FALSE;
3102: switch (ptype) {
3103: case MATPRODUCT_AB:
3104: m = A->rmap->n;
3105: n = B->cmap->n;
3106: k = A->cmap->n;
3107: Amat = Acusp->mat;
3108: Bmat = Bcusp->mat;
3109: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3110: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3111: break;
3112: case MATPRODUCT_AtB:
3113: m = A->cmap->n;
3114: n = B->cmap->n;
3115: k = A->rmap->n;
3116: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3117: Amat = Acusp->matTranspose;
3118: Bmat = Bcusp->mat;
3119: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3120: break;
3121: case MATPRODUCT_ABt:
3122: m = A->rmap->n;
3123: n = B->rmap->n;
3124: k = A->cmap->n;
3125: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3126: Amat = Acusp->mat;
3127: Bmat = Bcusp->matTranspose;
3128: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3129: break;
3130: default:
3131: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3132: }
3134: /* create cusparse matrix */
3135: PetscCall(MatSetSizes(C, m, n, m, n));
3136: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3137: c = (Mat_SeqAIJ *)C->data;
3138: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3139: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3140: Ccsr = new CsrMatrix;
3142: c->compressedrow.use = ciscompressed;
3143: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
3144: c->compressedrow.nrows = a->compressedrow.nrows;
3145: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3146: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3147: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3148: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3149: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3150: } else {
3151: c->compressedrow.nrows = 0;
3152: c->compressedrow.i = NULL;
3153: c->compressedrow.rindex = NULL;
3154: Ccusp->workVector = NULL;
3155: Cmat->cprowIndices = NULL;
3156: }
3157: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3158: Ccusp->mat = Cmat;
3159: Ccusp->mat->mat = Ccsr;
3160: Ccsr->num_rows = Ccusp->nrows;
3161: Ccsr->num_cols = n;
3162: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3163: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3164: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3165: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3166: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3167: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3168: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3169: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3170: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3171: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3173: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3174: c->nz = 0;
3175: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3176: Ccsr->values = new THRUSTARRAY(c->nz);
3177: goto finalizesym;
3178: }
3180: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3181: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3182: Acsr = (CsrMatrix *)Amat->mat;
3183: if (!biscompressed) {
3184: Bcsr = (CsrMatrix *)Bmat->mat;
3185: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3186: BmatSpDescr = Bmat->matDescr;
3187: #endif
3188: } else { /* we need to use row offsets for the full matrix */
3189: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3190: Bcsr = new CsrMatrix;
3191: Bcsr->num_rows = B->rmap->n;
3192: Bcsr->num_cols = cBcsr->num_cols;
3193: Bcsr->num_entries = cBcsr->num_entries;
3194: Bcsr->column_indices = cBcsr->column_indices;
3195: Bcsr->values = cBcsr->values;
3196: if (!Bcusp->rowoffsets_gpu) {
3197: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3198: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3199: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3200: }
3201: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3202: mmdata->Bcsr = Bcsr;
3203: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3204: if (Bcsr->num_rows && Bcsr->num_cols) {
3205: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3206: PetscCallCUSPARSE(stat);
3207: }
3208: BmatSpDescr = mmdata->matSpBDescr;
3209: #endif
3210: }
3211: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3212: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3213: /* precompute flops count */
3214: if (ptype == MATPRODUCT_AB) {
3215: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3216: const PetscInt st = a->i[i];
3217: const PetscInt en = a->i[i + 1];
3218: for (j = st; j < en; j++) {
3219: const PetscInt brow = a->j[j];
3220: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3221: }
3222: }
3223: } else if (ptype == MATPRODUCT_AtB) {
3224: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3225: const PetscInt anzi = a->i[i + 1] - a->i[i];
3226: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3227: flops += (2. * anzi) * bnzi;
3228: }
3229: } else { /* TODO */
3230: flops = 0.;
3231: }
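/* The counts above follow from the row-by-row structure of the product:
   for C = A*B,   flops = sum_i sum_{j in nz(A(i,:))} 2 * nnz(B(j,:));
   for C = A^T*B, row i of A and row i of B contribute an outer product, so
                  flops = sum_i 2 * nnz(A(i,:)) * nnz(B(i,:)). */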
3233: mmdata->flops = flops;
3234: PetscCall(PetscLogGpuTimeBegin());
3236: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3237: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3238: // cuda-12.2 requires non-null csrRowOffsets
3239: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3240: PetscCallCUSPARSE(stat);
3241: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3242: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3243: {
3244: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3245: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3246: */
3247: void *dBuffer1 = NULL;
3248: void *dBuffer2 = NULL;
3249: void *dBuffer3 = NULL;
3250: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3251: size_t bufferSize1 = 0;
3252: size_t bufferSize2 = 0;
3253: size_t bufferSize3 = 0;
3254: size_t bufferSize4 = 0;
3255: size_t bufferSize5 = 0;
3257:     /* ask for bufferSize1 bytes of external memory */
3258: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3259: PetscCallCUSPARSE(stat);
3260: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3261: /* inspect the matrices A and B to understand the memory requirement for the next step */
3262: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3263: PetscCallCUSPARSE(stat);
3265: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3266: PetscCallCUSPARSE(stat);
3267: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3268: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3269: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3270: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3271: PetscCallCUSPARSE(stat);
3272: PetscCallCUDA(cudaFree(dBuffer1));
3273: PetscCallCUDA(cudaFree(dBuffer2));
3275: /* get matrix C non-zero entries C_nnz1 */
3276: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3277: c->nz = (PetscInt)C_nnz1;
3278: /* allocate matrix C */
3279: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3280: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3281: Ccsr->values = new THRUSTARRAY(c->nz);
3282: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283: /* update matC with the new pointers */
3284: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3285: PetscCallCUSPARSE(stat);
3287: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3288: PetscCallCUSPARSE(stat);
3289: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3290: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3291: PetscCallCUSPARSE(stat);
3292: PetscCallCUDA(cudaFree(dBuffer3));
3293: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3294: PetscCallCUSPARSE(stat);
3295: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3296: }
3297: #else
3298: size_t bufSize2;
3299:   /* ask for bufSize2 bytes of external memory */
3300: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3301: PetscCallCUSPARSE(stat);
3302: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3303: /* inspect the matrices A and B to understand the memory requirement for the next step */
3304: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3305: PetscCallCUSPARSE(stat);
3306:   /* query the buffer size needed for the compute step */
3307: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3308: PetscCallCUSPARSE(stat);
3309:   /* Neither the CUSPARSE documentation nor the API is clear here:
3310:      both buffers are needed to perform the operations properly.
3311:      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3312:      it only appears in the workEstimation step, yet it seems to be needed in compute, so presumably its address
3313:      is stored in the descriptor. What a messy API... */
3314: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3315: /* compute the intermediate product of A * B */
3316: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3317: PetscCallCUSPARSE(stat);
3318: /* get matrix C non-zero entries C_nnz1 */
3319: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3320: c->nz = (PetscInt)C_nnz1;
3321: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3322: mmdata->mmBufferSize / 1024));
3323: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3324: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3325: Ccsr->values = new THRUSTARRAY(c->nz);
3326: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3328: PetscCallCUSPARSE(stat);
3329: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3330: PetscCallCUSPARSE(stat);
3331: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3332: #else
3333: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3334: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3335: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3336: PetscCallCUSPARSE(stat);
3337: c->nz = cnz;
3338: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3339: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3340: Ccsr->values = new THRUSTARRAY(c->nz);
3341: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3343: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3344:   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3345:      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
3346:      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3347: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3348: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3349: PetscCallCUSPARSE(stat);
3350: #endif
3351: PetscCall(PetscLogGpuFlops(mmdata->flops));
3352: PetscCall(PetscLogGpuTimeEnd());
3353: finalizesym:
3354: c->free_a = PETSC_TRUE;
3355: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3356: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3357: c->free_ij = PETSC_TRUE;
3358: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3359: PetscInt *d_i = c->i;
3360: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3361: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3362: ii = *Ccsr->row_offsets;
3363: jj = *Ccsr->column_indices;
3364: if (ciscompressed) d_i = c->compressedrow.i;
3365: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3366: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367: } else {
3368: PetscInt *d_i = c->i;
3369: if (ciscompressed) d_i = c->compressedrow.i;
3370: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3371: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372: }
3373: if (ciscompressed) { /* need to expand host row offsets */
3374: PetscInt r = 0;
3375: c->i[0] = 0;
3376: for (k = 0; k < c->compressedrow.nrows; k++) {
3377: const PetscInt next = c->compressedrow.rindex[k];
3378: const PetscInt old = c->compressedrow.i[k];
3379: for (; r < next; r++) c->i[r + 1] = old;
3380: }
3381: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3382: }
3383: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3384: PetscCall(PetscMalloc1(m, &c->ilen));
3385: PetscCall(PetscMalloc1(m, &c->imax));
3386: c->maxnz = c->nz;
3387: c->nonzerorowcnt = 0;
3388: c->rmax = 0;
3389: for (k = 0; k < m; k++) {
3390: const PetscInt nn = c->i[k + 1] - c->i[k];
3391: c->ilen[k] = c->imax[k] = nn;
3392: c->nonzerorowcnt += (PetscInt)!!nn;
3393: c->rmax = PetscMax(c->rmax, nn);
3394: }
3395: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3396: PetscCall(PetscMalloc1(c->nz, &c->a));
3397: Ccsr->num_entries = c->nz;
3399: C->nonzerostate++;
3400: PetscCall(PetscLayoutSetUp(C->rmap));
3401: PetscCall(PetscLayoutSetUp(C->cmap));
3402: Ccusp->nonzerostate = C->nonzerostate;
3403: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3404: C->preallocated = PETSC_TRUE;
3405: C->assembled = PETSC_FALSE;
3406: C->was_assembled = PETSC_FALSE;
3407: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3408: mmdata->reusesym = PETSC_TRUE;
3409: C->offloadmask = PETSC_OFFLOAD_GPU;
3410: }
3411: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3412: PetscFunctionReturn(PETSC_SUCCESS);
3413: }
3415: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3417: /* handles sparse or dense B */
3418: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3419: {
3420: Mat_Product *product = mat->product;
3421: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3423: PetscFunctionBegin;
3424: MatCheckProduct(mat, 1);
3425: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3426: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3427: if (product->type == MATPRODUCT_ABC) {
3428: Ciscusp = PETSC_FALSE;
3429: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3430: }
3431: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3432: PetscBool usecpu = PETSC_FALSE;
3433: switch (product->type) {
3434: case MATPRODUCT_AB:
3435: if (product->api_user) {
3436: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3437: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3438: PetscOptionsEnd();
3439: } else {
3440: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3441: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3442: PetscOptionsEnd();
3443: }
3444: break;
3445: case MATPRODUCT_AtB:
3446: if (product->api_user) {
3447: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3448: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3449: PetscOptionsEnd();
3450: } else {
3451: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3452: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3453: PetscOptionsEnd();
3454: }
3455: break;
3456: case MATPRODUCT_PtAP:
3457: if (product->api_user) {
3458: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3459: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3460: PetscOptionsEnd();
3461: } else {
3462: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3463: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3464: PetscOptionsEnd();
3465: }
3466: break;
3467: case MATPRODUCT_RARt:
3468: if (product->api_user) {
3469: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3470: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3471: PetscOptionsEnd();
3472: } else {
3473: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3474: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3475: PetscOptionsEnd();
3476: }
3477: break;
3478: case MATPRODUCT_ABC:
3479: if (product->api_user) {
3480: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3481: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3482: PetscOptionsEnd();
3483: } else {
3484: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3485: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3486: PetscOptionsEnd();
3487: }
3488: break;
3489: default:
3490: break;
3491: }
3492: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3493: }
3494: /* dispatch */
3495: if (isdense) {
3496: switch (product->type) {
3497: case MATPRODUCT_AB:
3498: case MATPRODUCT_AtB:
3499: case MATPRODUCT_ABt:
3500: case MATPRODUCT_PtAP:
3501: case MATPRODUCT_RARt:
3502: if (product->A->boundtocpu) {
3503: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3504: } else {
3505: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3506: }
3507: break;
3508: case MATPRODUCT_ABC:
3509: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3510: break;
3511: default:
3512: break;
3513: }
3514: } else if (Biscusp && Ciscusp) {
3515: switch (product->type) {
3516: case MATPRODUCT_AB:
3517: case MATPRODUCT_AtB:
3518: case MATPRODUCT_ABt:
3519: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3520: break;
3521: case MATPRODUCT_PtAP:
3522: case MATPRODUCT_RARt:
3523: case MATPRODUCT_ABC:
3524: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3525: break;
3526: default:
3527: break;
3528: }
3529: } else { /* fallback for AIJ */
3530: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3531: }
3532: PetscFunctionReturn(PETSC_SUCCESS);
3533: }
3535: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3536: {
3537: PetscFunctionBegin;
3538: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3539: PetscFunctionReturn(PETSC_SUCCESS);
3540: }
3542: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3543: {
3544: PetscFunctionBegin;
3545: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3546: PetscFunctionReturn(PETSC_SUCCESS);
3547: }
3549: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3550: {
3551: PetscFunctionBegin;
3552: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3553: PetscFunctionReturn(PETSC_SUCCESS);
3554: }
3556: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3557: {
3558: PetscFunctionBegin;
3559: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3560: PetscFunctionReturn(PETSC_SUCCESS);
3561: }
3563: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3564: {
3565: PetscFunctionBegin;
3566: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3567: PetscFunctionReturn(PETSC_SUCCESS);
3568: }
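/* Helper kernel used by MatMultAddKernel_SeqAIJCUSPARSE below: when A is stored in compressed-row
   form, the product A*x lives in a shorter work vector; this kernel scatter-adds that work vector x
   into the full-length result y at the row indices idx, i.e. y[idx[i]] += x[i]. */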
3570: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3571: {
3572: int i = blockIdx.x * blockDim.x + threadIdx.x;
3573: if (i < n) y[idx[i]] += x[i];
3574: }
3576: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3577: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3578: {
3579: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3580: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3581: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3582: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3583: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3584: PetscBool compressed;
3585: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3586: PetscInt nx, ny;
3587: #endif
3589: PetscFunctionBegin;
3590: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3591: if (!a->nz) {
3592: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3593: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3594: PetscFunctionReturn(PETSC_SUCCESS);
3595: }
3596: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3597: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3598: if (!trans) {
3599: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3600: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3601: } else {
3602: if (herm || !A->form_explicit_transpose) {
3603: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3604: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3605: } else {
3606: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3607: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3608: }
3609: }
3610: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3611: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3613: try {
3614: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3615: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3616: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3618: PetscCall(PetscLogGpuTimeBegin());
3619: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3620: /* z = A x + beta y.
3621:          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3622: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3623: */
3624: xptr = xarray;
3625: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3626: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3627: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3628: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3629: allocated to accommodate different uses. So we get the length info directly from mat.
3630: */
3631: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3632: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3633: nx = mat->num_cols; // since y = Ax
3634: ny = mat->num_rows;
3635: }
3636: #endif
3637: } else {
3638: /* z = A^T x + beta y
3639: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3640: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3641: */
3642: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3643: dptr = zarray;
3644: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3645: if (compressed) { /* Scatter x to work vector */
3646: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3648: thrust::for_each(
3649: #if PetscDefined(HAVE_THRUST_ASYNC)
3650: thrust::cuda::par.on(PetscDefaultCudaStream),
3651: #endif
3652: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3653: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3654: }
3655: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3656: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3657: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3658: nx = mat->num_rows; // since y = A^T x
3659: ny = mat->num_cols;
3660: }
3661: #endif
3662: }
3664: /* csr_spmv does y = alpha op(A) x + beta y */
3665: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3666: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3667: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3668: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3669: #else
3670: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3671: #endif
3673: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3674: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3675: if (!matDescr) {
3676: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3677: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3678: }
3679: #endif
3681: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3682: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3683: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3684: PetscCallCUSPARSE(
3685: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3686: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3687: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3688: PetscCallCUSPARSE(
3689: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3690: #endif
3691: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3692: } else {
3693: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3694: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3695: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3696: }
3698: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3699: #else
3700: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3701: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3702: #endif
3703: } else {
3704: if (cusparsestruct->nrows) {
3705: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3706: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3707: #else
3708: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3709: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3710: #endif
3711: }
3712: }
3713: PetscCall(PetscLogGpuTimeEnd());
3715: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3716: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3717: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3718: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3719: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3720: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3721: }
3722: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3723: PetscCall(VecSeq_CUDA::Set(zz, 0));
3724: }
3726: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3727: if (compressed) {
3728: PetscCall(PetscLogGpuTimeBegin());
3729:       /* Ideally this for_each would be asynchronous, but thrust::async::for_each() returns an (internally registered) event
3730:          whose scope destructor calls cudaStreamSynchronize() on the stream; one would have to store all such events to
3731:          prevent that, so a plain ScatterAdd kernel is used instead.
3732:       */
3733: #if 0
3734: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3735: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3736: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3737: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3738: VecCUDAPlusEquals());
3739: #else
3740: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3741: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3742: #endif
3743: PetscCall(PetscLogGpuTimeEnd());
3744: }
3745: } else {
3746: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3747: }
3748: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3749: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3750: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3751: } catch (char *ex) {
3752: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3753: }
3754: if (yy) {
3755: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3756: } else {
3757: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3758: }
3759: PetscFunctionReturn(PETSC_SUCCESS);
3760: }
3762: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3763: {
3764: PetscFunctionBegin;
3765: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3766: PetscFunctionReturn(PETSC_SUCCESS);
3767: }
3769: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3770: {
3771: PetscFunctionBegin;
3772: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3773: PetscFunctionReturn(PETSC_SUCCESS);
3774: }
3776: /*@
3777: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3778: (the default parallel PETSc format).
3780: Collective
3782: Input Parameters:
3783: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3784: . m - number of rows
3785: . n - number of columns
3786: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3787: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3789: Output Parameter:
3790: . A - the matrix
3792: Level: intermediate
3794: Notes:
3795:   This matrix will ultimately be pushed down to NVIDIA GPUs and will use the CuSPARSE library for
3796: calculations. For good matrix assembly performance the user should preallocate the matrix
3797: storage by setting the parameter `nz` (or the array `nnz`).
3799: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3800:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3801: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3803: The AIJ format, also called
3804: compressed row storage, is fully compatible with standard Fortran
3805: storage. That is, the stored row and column indices can begin at
3806: either one (as in Fortran) or zero.
3808: Specify the preallocated storage with either nz or nnz (not both).
3809: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3810: allocation.
3812: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3813: @*/
3814: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3815: {
3816: PetscFunctionBegin;
3817: PetscCall(MatCreate(comm, A));
3818: PetscCall(MatSetSizes(*A, m, n, m, n));
3819: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3820: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3821: PetscFunctionReturn(PETSC_SUCCESS);
3822: }
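/* A minimal usage sketch of the MatCreate()/MatSetType()/MatSeqAIJSetPreallocation() paradigm recommended
   in the manual page above; the helper name BuildTridiagonal and the tridiagonal stencil are illustrative
   assumptions only, not part of the PETSc API defined in this file.

   #include <petscmat.h>

   static PetscErrorCode BuildTridiagonal(PetscInt n, Mat *A)
   {
     PetscFunctionBegin;
     PetscCall(MatCreate(PETSC_COMM_SELF, A));
     PetscCall(MatSetSizes(*A, n, n, n, n));
     PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(*A, 3, NULL)); // at most 3 nonzeros per row
     for (PetscInt i = 0; i < n; i++) {
       PetscInt    cols[3];
       PetscScalar vals[3];
       PetscInt    nc = 0;
       if (i > 0) { cols[nc] = i - 1; vals[nc] = -1.0; nc++; }
       cols[nc] = i; vals[nc] = 2.0; nc++;
       if (i < n - 1) { cols[nc] = i + 1; vals[nc] = -1.0; nc++; }
       PetscCall(MatSetValues(*A, 1, &i, nc, cols, vals, INSERT_VALUES));
     }
     PetscCall(MatAssemblyBegin(*A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(*A, MAT_FINAL_ASSEMBLY));
     PetscFunctionReturn(PETSC_SUCCESS);
   }

   The equivalent direct call would be MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, n, n, 3, NULL, &A),
   followed by the same MatSetValues()/MatAssembly loop. */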
3824: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3825: {
3826: PetscFunctionBegin;
3827: if (A->factortype == MAT_FACTOR_NONE) {
3828: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3829: } else {
3830: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3831: }
3832: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3833: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3834: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3835: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3836: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3837: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3838: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3839: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3840: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3841: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3842: PetscCall(MatDestroy_SeqAIJ(A));
3843: PetscFunctionReturn(PETSC_SUCCESS);
3844: }
3846: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3847: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3848: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3849: {
3850: PetscFunctionBegin;
3851: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3852: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3853: PetscFunctionReturn(PETSC_SUCCESS);
3854: }
3856: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3857: {
3858: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3859: Mat_SeqAIJCUSPARSE *cy;
3860: Mat_SeqAIJCUSPARSE *cx;
3861: PetscScalar *ay;
3862: const PetscScalar *ax;
3863: CsrMatrix *csry, *csrx;
3865: PetscFunctionBegin;
3866: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3867: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3868: if (X->ops->axpy != Y->ops->axpy) {
3869: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3870: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3871: PetscFunctionReturn(PETSC_SUCCESS);
3872: }
3873: /* if we are here, it means both matrices are bound to GPU */
3874: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3875: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3876: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3877: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3878: csry = (CsrMatrix *)cy->mat->mat;
3879: csrx = (CsrMatrix *)cx->mat->mat;
3880: /* see if we can turn this into a cublas axpy */
3881: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3882: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3883: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3884: if (eq) str = SAME_NONZERO_PATTERN;
3885: }
3886: /* spgeam is buggy with one column */
3887: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3889: if (str == SUBSET_NONZERO_PATTERN) {
3890: PetscScalar b = 1.0;
3891: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3892: size_t bufferSize;
3893: void *buffer;
3894: #endif
3896: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3897: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3898: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3899: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3900: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3901: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3902: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3903: PetscCall(PetscLogGpuTimeBegin());
3904: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3905: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3906: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3907: PetscCall(PetscLogGpuTimeEnd());
3908: PetscCallCUDA(cudaFree(buffer));
3909: #else
3910: PetscCall(PetscLogGpuTimeBegin());
3911: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3912: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3913: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3914: PetscCall(PetscLogGpuTimeEnd());
3915: #endif
3916: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3917: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3918: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3919: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3920: } else if (str == SAME_NONZERO_PATTERN) {
3921: cublasHandle_t cublasv2handle;
3922: PetscBLASInt one = 1, bnz = 1;
3924: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3925: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3926: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3927: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3928: PetscCall(PetscLogGpuTimeBegin());
3929: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3930: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3931: PetscCall(PetscLogGpuTimeEnd());
3932: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3933: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3934: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3935: } else {
3936: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3937: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3938: }
3939: PetscFunctionReturn(PETSC_SUCCESS);
3940: }
3942: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3943: {
3944: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3945: PetscScalar *ay;
3946: cublasHandle_t cublasv2handle;
3947: PetscBLASInt one = 1, bnz = 1;
3949: PetscFunctionBegin;
3950: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3951: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3952: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3953: PetscCall(PetscLogGpuTimeBegin());
3954: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3955: PetscCall(PetscLogGpuFlops(bnz));
3956: PetscCall(PetscLogGpuTimeEnd());
3957: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3958: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3959: PetscFunctionReturn(PETSC_SUCCESS);
3960: }
3962: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3963: {
3964: PetscBool both = PETSC_FALSE;
3965: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3967: PetscFunctionBegin;
3968: if (A->factortype == MAT_FACTOR_NONE) {
3969: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3970: if (spptr->mat) {
3971: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3972: if (matrix->values) {
3973: both = PETSC_TRUE;
3974: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3975: }
3976: }
3977: if (spptr->matTranspose) {
3978: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3979: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3980: }
3981: }
3982: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3983: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3984: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3985: else A->offloadmask = PETSC_OFFLOAD_CPU;
3986: PetscFunctionReturn(PETSC_SUCCESS);
3987: }
3989: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3990: {
3991: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3993: PetscFunctionBegin;
3994: if (A->factortype != MAT_FACTOR_NONE) {
3995: A->boundtocpu = flg;
3996: PetscFunctionReturn(PETSC_SUCCESS);
3997: }
3998: if (flg) {
3999: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4001: A->ops->scale = MatScale_SeqAIJ;
4002: A->ops->axpy = MatAXPY_SeqAIJ;
4003: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
4004: A->ops->mult = MatMult_SeqAIJ;
4005: A->ops->multadd = MatMultAdd_SeqAIJ;
4006: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
4007: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
4008: A->ops->multhermitiantranspose = NULL;
4009: A->ops->multhermitiantransposeadd = NULL;
4010: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
4011: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4012: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4013: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4014: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4015: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4016: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4017: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4018: } else {
4019: A->ops->scale = MatScale_SeqAIJCUSPARSE;
4020: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
4021: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
4022: A->ops->mult = MatMult_SeqAIJCUSPARSE;
4023: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
4024: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
4025: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
4026: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4027: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4028: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
4029: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4030: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4031: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4032: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4033: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4034: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4035: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4037: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4038: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4039: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4040: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4041: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4042: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4043: }
4044: A->boundtocpu = flg;
4045: if (flg && a->inode.size) {
4046: a->inode.use = PETSC_TRUE;
4047: } else {
4048: a->inode.use = PETSC_FALSE;
4049: }
4050: PetscFunctionReturn(PETSC_SUCCESS);
4051: }
4053: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4054: {
4055: Mat B;
4057: PetscFunctionBegin;
4058: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4059: if (reuse == MAT_INITIAL_MATRIX) {
4060: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4061: } else if (reuse == MAT_REUSE_MATRIX) {
4062: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4063: }
4064: B = *newmat;
4066: PetscCall(PetscFree(B->defaultvectype));
4067: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4069: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4070: if (B->factortype == MAT_FACTOR_NONE) {
4071: Mat_SeqAIJCUSPARSE *spptr;
4072: PetscCall(PetscNew(&spptr));
4073: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4074: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4075: spptr->format = MAT_CUSPARSE_CSR;
4076: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4077: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4078: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4079: #else
4080: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4081: #endif
4082: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4083: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4084: #endif
4085: B->spptr = spptr;
4086: } else {
4087: Mat_SeqAIJCUSPARSETriFactors *spptr;
4089: PetscCall(PetscNew(&spptr));
4090: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4091: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4092: B->spptr = spptr;
4093: }
4094: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4095: }
4096: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4097: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4098: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4099: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4100: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4101: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4103: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4104: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4105: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4106: #if defined(PETSC_HAVE_HYPRE)
4107: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4108: #endif
4109: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4110: PetscFunctionReturn(PETSC_SUCCESS);
4111: }
4113: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4114: {
4115: PetscFunctionBegin;
4116: PetscCall(MatCreate_SeqAIJ(B));
4117: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4118: PetscFunctionReturn(PETSC_SUCCESS);
4119: }
4121: /*MC
4122: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4124:   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4125:   CSR, ELL, or hybrid format.
4126: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4128: Options Database Keys:
4129: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4130: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4131: Other options include ell (ellpack) or hyb (hybrid).
4132: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4133: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4135: Level: beginner
4137: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4138: M*/
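/* An illustrative command line combining the options database keys documented above; the program
   name ./app is a placeholder for any PETSc application, only the option names come from the
   manual page:

     ./app -mat_type aijcusparse -mat_cusparse_storage_format csr -mat_cusparse_use_cpu_solve

   The same storage-format choice can be made programmatically with MatCUSPARSESetFormat(). */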
4140: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4141: {
4142: PetscFunctionBegin;
4143: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4144: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4145: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4146: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4147: PetscFunctionReturn(PETSC_SUCCESS);
4148: }
4150: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4151: {
4152: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4154: PetscFunctionBegin;
4155: if (cusp) {
4156: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4157: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4158: delete cusp->workVector;
4159: delete cusp->rowoffsets_gpu;
4160: delete cusp->csr2csc_i;
4161: delete cusp->coords;
4162: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4163: PetscCall(PetscFree(mat->spptr));
4164: }
4165: PetscFunctionReturn(PETSC_SUCCESS);
4166: }
4168: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4169: {
4170: PetscFunctionBegin;
4171: if (*mat) {
4172: delete (*mat)->values;
4173: delete (*mat)->column_indices;
4174: delete (*mat)->row_offsets;
4175: delete *mat;
4176: *mat = 0;
4177: }
4178: PetscFunctionReturn(PETSC_SUCCESS);
4179: }
4181: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4182: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4183: {
4184: PetscFunctionBegin;
4185: if (*trifactor) {
4186: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4187: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4188: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4189: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4190: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4191: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4192: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4193: #endif
4194: PetscCall(PetscFree(*trifactor));
4195: }
4196: PetscFunctionReturn(PETSC_SUCCESS);
4197: }
4198: #endif
4200: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4201: {
4202: CsrMatrix *mat;
4204: PetscFunctionBegin;
4205: if (*matstruct) {
4206: if ((*matstruct)->mat) {
4207: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4208: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4209: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4210: #else
4211: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4212: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4213: #endif
4214: } else {
4215: mat = (CsrMatrix *)(*matstruct)->mat;
4216: PetscCall(CsrMatrix_Destroy(&mat));
4217: }
4218: }
4219: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4220: delete (*matstruct)->cprowIndices;
4221: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4222: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4223: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4225: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4226: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4227: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4229: for (int i = 0; i < 3; i++) {
4230: if (mdata->cuSpMV[i].initialized) {
4231: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4232: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4233: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4234: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4235: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4236: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4237: #endif
4238: }
4239: }
4240: #endif
4241: delete *matstruct;
4242: *matstruct = NULL;
4243: }
4244: PetscFunctionReturn(PETSC_SUCCESS);
4245: }
4247: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4248: {
4249: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4251: PetscFunctionBegin;
4252: if (fs) {
4253: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4254: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4255: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4256: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4257: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4258: delete fs->workVector;
4259: fs->workVector = NULL;
4260: #endif
4261: delete fs->rpermIndices;
4262: delete fs->cpermIndices;
4263: fs->rpermIndices = NULL;
4264: fs->cpermIndices = NULL;
4265: fs->init_dev_prop = PETSC_FALSE;
4266: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4267: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4268: PetscCallCUDA(cudaFree(fs->csrColIdx));
4269: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4270: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4271: PetscCallCUDA(cudaFree(fs->csrVal));
4272: PetscCallCUDA(cudaFree(fs->diag));
4273: PetscCallCUDA(cudaFree(fs->X));
4274: PetscCallCUDA(cudaFree(fs->Y));
4275: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
4276: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4277: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4278: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4279: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4280: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4281: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4282: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4283: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4284: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4285: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4286: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4287: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4288: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4289: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4290: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4291: PetscCall(PetscFree(fs->csrRowPtr_h));
4292: PetscCall(PetscFree(fs->csrVal_h));
4293: PetscCall(PetscFree(fs->diag_h));
4294: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4295: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4296: #endif
4297: }
4298: PetscFunctionReturn(PETSC_SUCCESS);
4299: }
4301: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4302: {
4303: PetscFunctionBegin;
4304: if (*trifactors) {
4305: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4306: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4307: PetscCall(PetscFree(*trifactors));
4308: }
4309: PetscFunctionReturn(PETSC_SUCCESS);
4310: }
4312: struct IJCompare {
4313: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4314: {
4315: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4316: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4317: return false;
4318: }
4319: };
4321: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4322: {
4323: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4325: PetscFunctionBegin;
4326: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4327: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4328: if (destroy) {
4329: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4330: delete cusp->csr2csc_i;
4331: cusp->csr2csc_i = NULL;
4332: }
4333: A->transupdated = PETSC_FALSE;
4334: PetscFunctionReturn(PETSC_SUCCESS);
4335: }
4337: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4338: {
4339: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4341: PetscFunctionBegin;
4342: PetscCallCUDA(cudaFree(coo->perm));
4343: PetscCallCUDA(cudaFree(coo->jmap));
4344: PetscCall(PetscFree(coo));
4345: PetscFunctionReturn(PETSC_SUCCESS);
4346: }
4348: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4349: {
4350: PetscBool dev_ij = PETSC_FALSE;
4351: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4352: PetscInt *i, *j;
4353: PetscContainer container_h;
4354: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4356: PetscFunctionBegin;
4357: PetscCall(PetscGetMemType(coo_i, &mtype));
4358: if (PetscMemTypeDevice(mtype)) {
4359: dev_ij = PETSC_TRUE;
4360: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4361: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4362: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4363: } else {
4364: i = coo_i;
4365: j = coo_j;
4366: }
4368: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4369: if (dev_ij) PetscCall(PetscFree2(i, j));
4370: mat->offloadmask = PETSC_OFFLOAD_CPU;
4371: // Create the GPU memory
4372: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4374: // Copy the COO struct to device
4375: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4376: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4377: PetscCall(PetscMalloc1(1, &coo_d));
4378: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4379: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4380: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4381: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4382: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4384: // Put the COO struct in a container and then attach that to the matrix
4385: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4386: PetscFunctionReturn(PETSC_SUCCESS);
4387: }
4389: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4390: {
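  // Grid-stride loop over the nnz entries of the assembled matrix: entry i accumulates all user-provided
  // COO values mapped to it, i.e. the sum of kv[perm[k]] for k in [jmap[i], jmap[i+1]).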
4391: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4392: const PetscCount grid_size = gridDim.x * blockDim.x;
4393: for (; i < nnz; i += grid_size) {
4394: PetscScalar sum = 0.0;
4395: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4396: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4397: }
4398: }
4400: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4401: {
4402: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4403: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4404: PetscCount Annz = seq->nz;
4405: PetscMemType memtype;
4406: const PetscScalar *v1 = v;
4407: PetscScalar *Aa;
4408: PetscContainer container;
4409: MatCOOStruct_SeqAIJ *coo;
4411: PetscFunctionBegin;
4412: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4414: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4415: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4417: PetscCall(PetscGetMemType(v, &memtype));
4418: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4419: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4420: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4421: }
4423: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4424: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4426: PetscCall(PetscLogGpuTimeBegin());
4427: if (Annz) {
4428: MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4429: PetscCallCUDA(cudaPeekAtLastError());
4430: }
4431: PetscCall(PetscLogGpuTimeEnd());
4433: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4434: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4436: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4437: PetscFunctionReturn(PETSC_SUCCESS);
4438: }
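/*
   Illustrative sketch (not part of the original source): how a user exercises the COO assembly path
   implemented by MatSetPreallocationCOO_SeqAIJCUSPARSE()/MatSetValuesCOO_SeqAIJCUSPARSE() above,
   through the public MatSetPreallocationCOO()/MatSetValuesCOO() interface. The index and value arrays
   are made-up example data; with MATSEQAIJCUSPARSE the values kernel MatAddCOOValues() runs on the GPU.

     Mat         A;
     PetscInt    coo_i[] = {0, 0, 1, 1};        // hypothetical row indices (repeated entries are summed)
     PetscInt    coo_j[] = {0, 1, 0, 1};        // hypothetical column indices
     PetscScalar coo_v[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j)); // builds the host/device MatCOOStruct_SeqAIJ
     PetscCall(MatSetValuesCOO(A, coo_v, INSERT_VALUES));   // launches MatAddCOOValues() on the device
     PetscCall(MatDestroy(&A));
*/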
4440: /*@C
4441: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4443: Not Collective
4445: Input Parameters:
4446: + A - the matrix
4447: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4449: Output Parameters:
4450: + i - the CSR row pointers
4451: - j - the CSR column indices
4453: Level: developer
4455: Note:
4456: When `compressed` is `PETSC_TRUE`, the CSR structure does not contain empty rows
4458: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4459: @*/
4460: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4461: {
4462: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4463: CsrMatrix *csr;
4464: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4466: PetscFunctionBegin;
4468: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4469: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4470: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4471: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4472: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4473: csr = (CsrMatrix *)cusp->mat->mat;
4474: if (i) {
4475: if (!compressed && a->compressedrow.use) { /* need full row offset */
4476: if (!cusp->rowoffsets_gpu) {
4477: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4478: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4479: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4480: }
4481: *i = cusp->rowoffsets_gpu->data().get();
4482: } else *i = csr->row_offsets->data().get();
4483: }
4484: if (j) *j = csr->column_indices->data().get();
4485: PetscFunctionReturn(PETSC_SUCCESS);
4486: }
4488: /*@C
4489: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4491: Not Collective
4493: Input Parameters:
4494: + A - the matrix
4495: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4496: . i - the CSR row pointers
4497: - j - the CSR column indices
4499: Level: developer
4501: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4502: @*/
4503: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4504: {
4505: PetscFunctionBegin;
4507: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4508: if (i) *i = NULL;
4509: if (j) *j = NULL;
4510: (void)compressed;
4511: PetscFunctionReturn(PETSC_SUCCESS);
4512: }
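/*
   Illustrative sketch (not part of the original source): typical pairing of MatSeqAIJCUSPARSEGetIJ()
   and MatSeqAIJCUSPARSERestoreIJ(). The returned pointers are device (GPU) addresses into the CSR
   structure, suitable for CUDA kernels or cuSPARSE calls, and must not be dereferenced on the host.
   "A" is assumed to be an assembled MATSEQAIJCUSPARSE matrix.

     const int *ia, *ja;

     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_TRUE, &ia, &ja)); // compressed CSR: no empty rows
     // ... pass ia/ja to a device kernel or a cuSPARSE routine ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_TRUE, &ia, &ja));
*/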
4514: /*@C
4515: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4517: Not Collective
4519: Input Parameter:
4520: . A - a `MATSEQAIJCUSPARSE` matrix
4522: Output Parameter:
4523: . a - pointer to the device data
4525: Level: developer
4527: Note:
4528: May trigger host-device copies if up-to-date matrix data is on host
4530: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4531: @*/
4532: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4533: {
4534: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4535: CsrMatrix *csr;
4537: PetscFunctionBegin;
4539: PetscAssertPointer(a, 2);
4540: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4541: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4542: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4543: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4544: csr = (CsrMatrix *)cusp->mat->mat;
4545: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4546: *a = csr->values->data().get();
4547: PetscFunctionReturn(PETSC_SUCCESS);
4548: }
4550: /*@C
4551: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4553: Not Collective
4555: Input Parameters:
4556: + A - a `MATSEQAIJCUSPARSE` matrix
4557: - a - pointer to the device data
4559: Level: developer
4561: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4562: @*/
4563: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4564: {
4565: PetscFunctionBegin;
4567: PetscAssertPointer(a, 2);
4568: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4569: *a = NULL;
4570: PetscFunctionReturn(PETSC_SUCCESS);
4571: }
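/*
   Illustrative sketch (not part of the original source): read-only access to the device value array.
   The pointer from MatSeqAIJCUSPARSEGetArrayRead() lives in GPU memory; here a single entry is copied
   back to the host just for inspection. "A" is assumed to be an assembled MATSEQAIJCUSPARSE matrix.

     const PetscScalar *da;
     PetscScalar        first;

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da));
     PetscCallCUDA(cudaMemcpy(&first, da, sizeof(PetscScalar), cudaMemcpyDeviceToHost));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da));
*/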
4573: /*@C
4574: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4576: Not Collective
4578: Input Parameter:
4579: . A - a `MATSEQAIJCUSPARSE` matrix
4581: Output Parameter:
4582: . a - pointer to the device data
4584: Level: developer
4586: Note:
4587: May trigger host-device copies if up-to-date matrix data is on host
4589: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4590: @*/
4591: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4592: {
4593: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4594: CsrMatrix *csr;
4596: PetscFunctionBegin;
4598: PetscAssertPointer(a, 2);
4599: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4600: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4601: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4602: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4603: csr = (CsrMatrix *)cusp->mat->mat;
4604: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4605: *a = csr->values->data().get();
4606: A->offloadmask = PETSC_OFFLOAD_GPU;
4607: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4608: PetscFunctionReturn(PETSC_SUCCESS);
4609: }
4610: /*@C
4611: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4613: Not Collective
4615: Input Parameters:
4616: + A - a `MATSEQAIJCUSPARSE` matrix
4617: - a - pointer to the device data
4619: Level: developer
4621: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4622: @*/
4623: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4624: {
4625: PetscFunctionBegin;
4627: PetscAssertPointer(a, 2);
4628: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4629: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4630: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4631: *a = NULL;
4632: PetscFunctionReturn(PETSC_SUCCESS);
4633: }
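/*
   Illustrative sketch (not part of the original source): read-write access to the device value array,
   here scaling all stored values in place with thrust. The nonzero count is taken from the Mat_SeqAIJ
   host data; "A" is assumed to be an assembled MATSEQAIJCUSPARSE matrix and "alpha" a host scalar.

     Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)A->data;
     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &da));
     {
       auto dptr = thrust::device_pointer_cast(da);
       thrust::transform(dptr, dptr + aij->nz, thrust::make_constant_iterator(alpha), dptr, thrust::multiplies<PetscScalar>());
     }
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &da)); // bumps the object state and invalidates the cached diagonal
*/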
4635: /*@C
4636: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4638: Not Collective
4640: Input Parameter:
4641: . A - a `MATSEQAIJCUSPARSE` matrix
4643: Output Parameter:
4644: . a - pointer to the device data
4646: Level: developer
4648: Note:
4649: Does not trigger host-device copies and flags data validity on the GPU
4651: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4652: @*/
4653: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4654: {
4655: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4656: CsrMatrix *csr;
4658: PetscFunctionBegin;
4660: PetscAssertPointer(a, 2);
4661: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4662: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4663: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4664: csr = (CsrMatrix *)cusp->mat->mat;
4665: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4666: *a = csr->values->data().get();
4667: A->offloadmask = PETSC_OFFLOAD_GPU;
4668: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4669: PetscFunctionReturn(PETSC_SUCCESS);
4670: }
4672: /*@C
4673: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4675: Not Collective
4677: Input Parameters:
4678: + A - a `MATSEQAIJCUSPARSE` matrix
4679: - a - pointer to the device data
4681: Level: developer
4683: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4684: @*/
4685: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4686: {
4687: PetscFunctionBegin;
4689: PetscAssertPointer(a, 2);
4690: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4691: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4692: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4693: *a = NULL;
4694: PetscFunctionReturn(PETSC_SUCCESS);
4695: }
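/*
   Illustrative sketch (not part of the original source): write-only access, overwriting all stored
   values on the device without first syncing data from the host. Here the values are simply zeroed
   with cudaMemset(); the nonzero count again comes from the Mat_SeqAIJ host data, and "A" is assumed
   to be an assembled MATSEQAIJCUSPARSE matrix.

     Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)A->data;
     PetscScalar *da;

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &da));
     PetscCallCUDA(cudaMemset(da, 0, aij->nz * sizeof(PetscScalar)));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &da));
*/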
4697: struct IJCompare4 {
4698: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4699: {
4700: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4701: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4702: return false;
4703: }
4704: };
4706: struct Shift {
4707: int _shift;
4709: Shift(int shift) : _shift(shift) { }
4710: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4711: };
4713: /* merges two SeqAIJCUSPARSE matrices A and B row-wise, so that row i of the result is row i of A followed by row i of B; the [A';B']' operation in MATLAB notation */
4714: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4715: {
4716: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4717: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4718: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4719: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4720: PetscInt Annz, Bnnz;
4721: cusparseStatus_t stat;
4722: PetscInt i, m, n, zero = 0;
4724: PetscFunctionBegin;
4727: PetscAssertPointer(C, 4);
4728: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4729: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4730: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4731: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4732: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4733: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4734: if (reuse == MAT_INITIAL_MATRIX) {
4735: m = A->rmap->n;
4736: n = A->cmap->n + B->cmap->n;
4737: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4738: PetscCall(MatSetSizes(*C, m, n, m, n));
4739: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4740: c = (Mat_SeqAIJ *)(*C)->data;
4741: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4742: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4743: Ccsr = new CsrMatrix;
4744: Cmat->cprowIndices = NULL;
4745: c->compressedrow.use = PETSC_FALSE;
4746: c->compressedrow.nrows = 0;
4747: c->compressedrow.i = NULL;
4748: c->compressedrow.rindex = NULL;
4749: Ccusp->workVector = NULL;
4750: Ccusp->nrows = m;
4751: Ccusp->mat = Cmat;
4752: Ccusp->mat->mat = Ccsr;
4753: Ccsr->num_rows = m;
4754: Ccsr->num_cols = n;
4755: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4756: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4757: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4758: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4759: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4760: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4761: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4762: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4763: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4764: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4765: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4766: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4767: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4769: Acsr = (CsrMatrix *)Acusp->mat->mat;
4770: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4771: Annz = (PetscInt)Acsr->column_indices->size();
4772: Bnnz = (PetscInt)Bcsr->column_indices->size();
4773: c->nz = Annz + Bnnz;
4774: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4775: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4776: Ccsr->values = new THRUSTARRAY(c->nz);
4777: Ccsr->num_entries = c->nz;
4778: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4779: if (c->nz) {
4780: auto Acoo = new THRUSTINTARRAY32(Annz);
4781: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4782: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4783: THRUSTINTARRAY32 *Aroff, *Broff;
4785: if (a->compressedrow.use) { /* need full row offset */
4786: if (!Acusp->rowoffsets_gpu) {
4787: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4788: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4789: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4790: }
4791: Aroff = Acusp->rowoffsets_gpu;
4792: } else Aroff = Acsr->row_offsets;
4793: if (b->compressedrow.use) { /* need full row offset */
4794: if (!Bcusp->rowoffsets_gpu) {
4795: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4796: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4797: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4798: }
4799: Broff = Bcusp->rowoffsets_gpu;
4800: } else Broff = Bcsr->row_offsets;
4801: PetscCall(PetscLogGpuTimeBegin());
4802: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4803: PetscCallCUSPARSE(stat);
4804: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4805: PetscCallCUSPARSE(stat);
4806: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4807: auto Aperm = thrust::make_constant_iterator(1);
4808: auto Bperm = thrust::make_constant_iterator(0);
4809: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4810: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4811: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4812: #else
4813: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4814: auto Bcib = Bcsr->column_indices->begin();
4815: auto Bcie = Bcsr->column_indices->end();
4816: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4817: #endif
4818: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4819: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4820: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4821: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4822: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4823: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4824: auto p1 = Ccusp->coords->begin();
4825: auto p2 = Ccusp->coords->begin();
4826: thrust::advance(p2, Annz);
4827: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4828: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4829: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4830: #endif
4831: auto cci = thrust::make_counting_iterator(zero);
4832: auto cce = thrust::make_counting_iterator(c->nz);
4833: #if 0 //Errors on SUMMIT cuda 11.1.0
4834: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4835: #else
4836: auto pred = thrust::identity<int>();
4837: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4838: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4839: #endif
4840: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4841: PetscCallCUSPARSE(stat);
4842: PetscCall(PetscLogGpuTimeEnd());
4843: delete wPerm;
4844: delete Acoo;
4845: delete Bcoo;
4846: delete Ccoo;
4847: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4848: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4849: PetscCallCUSPARSE(stat);
4850: #endif
4851: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4852: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4853: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4854: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4855: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4856: CsrMatrix *CcsrT = new CsrMatrix;
4857: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4858: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4860: (*C)->form_explicit_transpose = PETSC_TRUE;
4861: (*C)->transupdated = PETSC_TRUE;
4862: Ccusp->rowoffsets_gpu = NULL;
4863: CmatT->cprowIndices = NULL;
4864: CmatT->mat = CcsrT;
4865: CcsrT->num_rows = n;
4866: CcsrT->num_cols = m;
4867: CcsrT->num_entries = c->nz;
4869: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4870: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4871: CcsrT->values = new THRUSTARRAY(c->nz);
4873: PetscCall(PetscLogGpuTimeBegin());
4874: auto rT = CcsrT->row_offsets->begin();
4875: if (AT) {
4876: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4877: thrust::advance(rT, -1);
4878: }
4879: if (BT) {
4880: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4881: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4882: thrust::copy(titb, tite, rT);
4883: }
4884: auto cT = CcsrT->column_indices->begin();
4885: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4886: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4887: auto vT = CcsrT->values->begin();
4888: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4889: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4890: PetscCall(PetscLogGpuTimeEnd());
4892: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4893: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4894: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4895: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4896: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4897: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4898: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4899: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4900: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4901: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4902: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4903: PetscCallCUSPARSE(stat);
4904: #endif
4905: Ccusp->matTranspose = CmatT;
4906: }
4907: }
4909: c->free_a = PETSC_TRUE;
4910: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4911: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4912: c->free_ij = PETSC_TRUE;
4913: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4914: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4915: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4916: ii = *Ccsr->row_offsets;
4917: jj = *Ccsr->column_indices;
4918: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4919: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4920: } else {
4921: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4922: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4923: }
4924: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4925: PetscCall(PetscMalloc1(m, &c->ilen));
4926: PetscCall(PetscMalloc1(m, &c->imax));
4927: c->maxnz = c->nz;
4928: c->nonzerorowcnt = 0;
4929: c->rmax = 0;
4930: for (i = 0; i < m; i++) {
4931: const PetscInt nn = c->i[i + 1] - c->i[i];
4932: c->ilen[i] = c->imax[i] = nn;
4933: c->nonzerorowcnt += (PetscInt)!!nn;
4934: c->rmax = PetscMax(c->rmax, nn);
4935: }
4936: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4937: PetscCall(PetscMalloc1(c->nz, &c->a));
4938: (*C)->nonzerostate++;
4939: PetscCall(PetscLayoutSetUp((*C)->rmap));
4940: PetscCall(PetscLayoutSetUp((*C)->cmap));
4941: Ccusp->nonzerostate = (*C)->nonzerostate;
4942: (*C)->preallocated = PETSC_TRUE;
4943: } else {
4944: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4945: c = (Mat_SeqAIJ *)(*C)->data;
4946: if (c->nz) {
4947: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4948: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4949: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4950: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4951: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4952: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4953: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4954: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4955: Acsr = (CsrMatrix *)Acusp->mat->mat;
4956: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4957: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4958: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4959: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4960: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4961: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4962: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4963: auto pmid = Ccusp->coords->begin();
4964: thrust::advance(pmid, Acsr->num_entries);
4965: PetscCall(PetscLogGpuTimeBegin());
4966: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4967: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4968: thrust::for_each(zibait, zieait, VecCUDAEquals());
4969: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4970: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4971: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4972: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4973: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4974: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4975: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4976: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4977: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4978: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4979: auto vT = CcsrT->values->begin();
4980: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4981: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4982: (*C)->transupdated = PETSC_TRUE;
4983: }
4984: PetscCall(PetscLogGpuTimeEnd());
4985: }
4986: }
4987: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4988: (*C)->assembled = PETSC_TRUE;
4989: (*C)->was_assembled = PETSC_FALSE;
4990: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4991: PetscFunctionReturn(PETSC_SUCCESS);
4992: }
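/*
   Illustrative sketch (not part of the original source): merging two MATSEQAIJCUSPARSE matrices with
   the routine above. A and B must have the same number of rows; the result holds A's columns followed
   by B's columns. On subsequent calls with unchanged nonzero patterns the existing C can be reused.

     Mat C = NULL;

     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C)); // allocate and fill C
     // ... update the numerical values of A and/or B (same nonzero pattern) ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));   // refresh C's values only
     PetscCall(MatDestroy(&C));
*/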
4994: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4995: {
4996: bool dmem;
4997: const PetscScalar *av;
4999: PetscFunctionBegin;
5000: dmem = isCudaMem(v);
5001: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5002: if (n && idx) {
5003: THRUSTINTARRAY widx(n);
5004: widx.assign(idx, idx + n);
5005: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5007: THRUSTARRAY *w = NULL;
5008: thrust::device_ptr<PetscScalar> dv;
5009: if (dmem) {
5010: dv = thrust::device_pointer_cast(v);
5011: } else {
5012: w = new THRUSTARRAY(n);
5013: dv = w->data();
5014: }
5015: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5017: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5018: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5019: thrust::for_each(zibit, zieit, VecCUDAEquals());
5020: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5021: delete w;
5022: } else {
5023: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5024: }
5025: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
5026: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5027: PetscFunctionReturn(PETSC_SUCCESS);
5028: }
5029: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()