Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
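Matrices of this type are typically obtained with MatCreateSeqAIJCUSPARSE(),
MatSetType(A, MATSEQAIJCUSPARSE), or the command line option -mat_type seqaijcusparse.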
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #endif
19: #include <thrust/iterator/constant_iterator.h>
20: #include <thrust/remove.h>
21: #include <thrust/sort.h>
22: #include <thrust/unique.h>
23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24: #include <cuda/std/functional>
25: #endif
27: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
28: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
29: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
30: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
33: typedef enum {
34: CUSPARSE_MV_ALG_DEFAULT = 0,
35: CUSPARSE_COOMV_ALG = 1,
36: CUSPARSE_CSRMV_ALG1 = 2,
37: CUSPARSE_CSRMV_ALG2 = 3
38: } cusparseSpMVAlg_t;
40: typedef enum {
41: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
42: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
43: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
44: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
45: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
46: CUSPARSE_SPMM_ALG_DEFAULT = 0,
47: CUSPARSE_SPMM_COO_ALG1 = 1,
48: CUSPARSE_SPMM_COO_ALG2 = 2,
49: CUSPARSE_SPMM_COO_ALG3 = 3,
50: CUSPARSE_SPMM_COO_ALG4 = 5,
51: CUSPARSE_SPMM_CSR_ALG1 = 4,
52: CUSPARSE_SPMM_CSR_ALG2 = 6,
53: } cusparseSpMMAlg_t;
55: typedef enum {
56: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
57: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
58: } cusparseCsr2CscAlg_t;
59: */
60: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
61: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
62: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
63: #endif
65: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
66: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
67: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
68: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
69: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
70: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
71: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
72: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
75: #endif
76: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
77: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
78: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
79: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
80: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
84: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
85: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
87: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
88: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
92: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
95: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
99: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
100: {
101: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
103: PetscFunctionBegin;
104: switch (op) {
105: case MAT_CUSPARSE_MULT:
106: cusparsestruct->format = format;
107: break;
108: case MAT_CUSPARSE_ALL:
109: cusparsestruct->format = format;
110: break;
111: default:
112: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
113: }
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: /*@
118: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
119: operation. Only the `MatMult()` operation can use different GPU storage formats.
121: Not Collective
123: Input Parameters:
124: + A - Matrix of type `MATSEQAIJCUSPARSE`
125: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
126: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
127: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`).
129: Level: intermediate
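  Example Usage:
  A minimal usage sketch, assuming `A` has already been created with type `MATSEQAIJCUSPARSE`. The
  equivalent command line options are -mat_cusparse_mult_storage_format and -mat_cusparse_storage_format,
  handled in MatSetFromOptions_SeqAIJCUSPARSE() below.
.vb
  MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL);
.ve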
131: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
132: @*/
133: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
134: {
135: PetscFunctionBegin;
137: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
138: PetscFunctionReturn(PETSC_SUCCESS);
139: }
141: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
142: {
143: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
145: PetscFunctionBegin;
146: cusparsestruct->use_cpu_solve = use_cpu;
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: /*@
151: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
153: Input Parameters:
154: + A - Matrix of type `MATSEQAIJCUSPARSE`
155: - use_cpu - set flag for using the built-in CPU `MatSolve()`
157: Level: intermediate
159: Note:
160: The cuSPARSE LU solver currently computes the factors with the built-in CPU method
161: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
162: This method is used to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
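  Example Usage:
  A minimal usage sketch, assuming `A` is a matrix of type `MATSEQAIJCUSPARSE`. The same behavior can be
  requested from the command line with -mat_cusparse_use_cpu_solve, handled in
  MatSetFromOptions_SeqAIJCUSPARSE() below.
.vb
  MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE);
.ve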
164: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
165: @*/
166: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
167: {
168: PetscFunctionBegin;
170: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
171: PetscFunctionReturn(PETSC_SUCCESS);
172: }
174: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
175: {
176: PetscFunctionBegin;
177: switch (op) {
178: case MAT_FORM_EXPLICIT_TRANSPOSE:
179: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
180: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
181: A->form_explicit_transpose = flg;
182: break;
183: default:
184: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
185: break;
186: }
187: PetscFunctionReturn(PETSC_SUCCESS);
188: }
190: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
191: {
192: MatCUSPARSEStorageFormat format;
193: PetscBool flg;
194: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
196: PetscFunctionBegin;
197: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
198: if (A->factortype == MAT_FACTOR_NONE) {
199: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
200: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
202: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
203: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
204: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
205: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
206: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
207: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
208: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
209: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
210: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
211: #else
212: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
213: #endif
214: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
215: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
217: PetscCall(
218: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
219: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
220: #endif
221: }
222: PetscOptionsHeadEnd();
223: PetscFunctionReturn(PETSC_SUCCESS);
224: }
226: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
227: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
228: {
229: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
230: PetscInt m = A->rmap->n;
231: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
232: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
233: const MatScalar *Aa = a->a;
234: PetscInt *Mi, *Mj, Mnz;
235: PetscScalar *Ma;
237: PetscFunctionBegin;
238: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
239: if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
240: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
241: Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
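// Row i of M holds, in order: the entries of L's row i (strictly below the diagonal), the diagonal
// entry itself, and the entries of U's row i to the right of the diagonal, as filled in below.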
242: PetscCall(PetscMalloc1(m + 1, &Mi));
243: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
244: PetscCall(PetscMalloc1(Mnz, &Ma));
245: Mi[0] = 0;
246: for (PetscInt i = 0; i < m; i++) {
247: PetscInt llen = Ai[i + 1] - Ai[i];
248: PetscInt ulen = Adiag[i] - Adiag[i + 1];
249: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
250: Mj[Mi[i] + llen] = i; // diagonal entry
251: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
252: Mi[i + 1] = Mi[i] + llen + ulen;
253: }
254: // Copy M (L,U) from host to device
255: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
256: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
257: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
258: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
259: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
261: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
262: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
263: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
264: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
265: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
266: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
267: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
268: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
270: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
271: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
272: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
274: fillMode = CUSPARSE_FILL_MODE_UPPER;
275: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
276: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
277: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
278: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
280: // Allocate work vectors in SpSv
281: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
282: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
284: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
285: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
287: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
288: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
289: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
290: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
291: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
292: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
293: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
295: // Record for reuse
296: fs->csrRowPtr_h = Mi;
297: fs->csrVal_h = Ma;
298: PetscCall(PetscFree(Mj));
299: }
300: // Copy the value
301: Mi = fs->csrRowPtr_h;
302: Ma = fs->csrVal_h;
303: Mnz = Mi[m];
304: for (PetscInt i = 0; i < m; i++) {
305: PetscInt llen = Ai[i + 1] - Ai[i];
306: PetscInt ulen = Adiag[i] - Adiag[i + 1];
307: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
308: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry
309: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
310: }
311: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
313: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
314: if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
315: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
316: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
317: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
318: } else
319: #endif
320: {
321: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
322: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
324: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
325: fs->updatedSpSVAnalysis = PETSC_TRUE;
326: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
327: }
328: }
329: PetscFunctionReturn(PETSC_SUCCESS);
330: }
331: #else
332: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
333: {
334: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
335: PetscInt n = A->rmap->n;
336: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
337: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
338: const PetscInt *ai = a->i, *aj = a->j, *vi;
339: const MatScalar *aa = a->a, *v;
340: PetscInt *AiLo, *AjLo;
341: PetscInt i, nz, nzLower, offset, rowOffset;
343: PetscFunctionBegin;
344: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
345: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
346: try {
347: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
348: nzLower = n + ai[n] - ai[1];
349: if (!loTriFactor) {
350: PetscScalar *AALo;
352: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
354: /* Allocate Space for the lower triangular matrix */
355: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
356: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
358: /* Fill the lower triangular matrix */
359: AiLo[0] = (PetscInt)0;
360: AiLo[n] = nzLower;
361: AjLo[0] = (PetscInt)0;
362: AALo[0] = (MatScalar)1.0;
363: v = aa;
364: vi = aj;
365: offset = 1;
366: rowOffset = 1;
367: for (i = 1; i < n; i++) {
368: nz = ai[i + 1] - ai[i];
369: /* additional 1 for the term on the diagonal */
370: AiLo[i] = rowOffset;
371: rowOffset += nz + 1;
373: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
374: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
376: offset += nz;
377: AjLo[offset] = (PetscInt)i;
378: AALo[offset] = (MatScalar)1.0;
379: offset += 1;
381: v += nz;
382: vi += nz;
383: }
385: /* allocate space for the triangular factor information */
386: PetscCall(PetscNew(&loTriFactor));
387: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
388: /* Create the matrix description */
389: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
390: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
391: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
392: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
393: #else
394: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
395: #endif
396: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
397: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
399: /* set the operation */
400: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
402: /* set the matrix */
403: loTriFactor->csrMat = new CsrMatrix;
404: loTriFactor->csrMat->num_rows = n;
405: loTriFactor->csrMat->num_cols = n;
406: loTriFactor->csrMat->num_entries = nzLower;
408: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
409: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
411: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
412: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
414: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
415: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
417: /* Create the solve analysis information */
418: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
419: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
420: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
421: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
422: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
423: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
424: #endif
426: /* perform the solve analysis */
427: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
428: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
429: PetscCallCUDA(WaitForCUDA());
430: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
432: /* assign the pointer */
433: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
434: loTriFactor->AA_h = AALo;
435: PetscCallCUDA(cudaFreeHost(AiLo));
436: PetscCallCUDA(cudaFreeHost(AjLo));
437: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
438: } else { /* update values only */
439: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
440: /* Fill the lower triangular matrix */
441: loTriFactor->AA_h[0] = 1.0;
442: v = aa;
443: vi = aj;
444: offset = 1;
445: for (i = 1; i < n; i++) {
446: nz = ai[i + 1] - ai[i];
447: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
448: offset += nz;
449: loTriFactor->AA_h[offset] = 1.0;
450: offset += 1;
451: v += nz;
452: }
453: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
454: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
455: }
456: } catch (char *ex) {
457: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
458: }
459: }
460: PetscFunctionReturn(PETSC_SUCCESS);
461: }
463: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
464: {
465: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
466: PetscInt n = A->rmap->n;
467: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
468: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
469: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
470: const MatScalar *aa = a->a, *v;
471: PetscInt *AiUp, *AjUp;
472: PetscInt i, nz, nzUpper, offset;
474: PetscFunctionBegin;
475: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
476: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
477: try {
478: /* next, figure out the number of nonzeros in the upper triangular matrix. */
479: nzUpper = adiag[0] - adiag[n];
480: if (!upTriFactor) {
481: PetscScalar *AAUp;
483: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
485: /* Allocate Space for the upper triangular matrix */
486: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
487: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
489: /* Fill the upper triangular matrix */
490: AiUp[0] = (PetscInt)0;
491: AiUp[n] = nzUpper;
492: offset = nzUpper;
493: for (i = n - 1; i >= 0; i--) {
494: v = aa + adiag[i + 1] + 1;
495: vi = aj + adiag[i + 1] + 1;
497: /* number of elements NOT on the diagonal */
498: nz = adiag[i] - adiag[i + 1] - 1;
500: /* decrement the offset */
501: offset -= (nz + 1);
503: /* first, set the diagonal elements */
504: AjUp[offset] = (PetscInt)i;
505: AAUp[offset] = (MatScalar)1. / v[nz];
506: AiUp[i] = AiUp[i + 1] - (nz + 1);
508: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
509: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
510: }
512: /* allocate space for the triangular factor information */
513: PetscCall(PetscNew(&upTriFactor));
514: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
516: /* Create the matrix description */
517: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
518: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
519: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
520: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
521: #else
522: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
523: #endif
524: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
525: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
527: /* set the operation */
528: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
530: /* set the matrix */
531: upTriFactor->csrMat = new CsrMatrix;
532: upTriFactor->csrMat->num_rows = n;
533: upTriFactor->csrMat->num_cols = n;
534: upTriFactor->csrMat->num_entries = nzUpper;
536: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
537: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
539: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
540: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
542: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
543: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
545: /* Create the solve analysis information */
546: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
547: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
548: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
549: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
550: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
551: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
552: #endif
554: /* perform the solve analysis */
555: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
556: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
558: PetscCallCUDA(WaitForCUDA());
559: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
561: /* assign the pointer */
562: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
563: upTriFactor->AA_h = AAUp;
564: PetscCallCUDA(cudaFreeHost(AiUp));
565: PetscCallCUDA(cudaFreeHost(AjUp));
566: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
567: } else {
568: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
569: /* Fill the upper triangular matrix */
570: offset = nzUpper;
571: for (i = n - 1; i >= 0; i--) {
572: v = aa + adiag[i + 1] + 1;
574: /* number of elements NOT on the diagonal */
575: nz = adiag[i] - adiag[i + 1] - 1;
577: /* decrement the offset */
578: offset -= (nz + 1);
580: /* first, set the diagonal elements */
581: upTriFactor->AA_h[offset] = 1. / v[nz];
582: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
583: }
584: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
585: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
586: }
587: } catch (char *ex) {
588: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
589: }
590: }
591: PetscFunctionReturn(PETSC_SUCCESS);
592: }
593: #endif
595: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
596: {
597: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
598: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
599: IS isrow = a->row, isicol = a->icol;
600: PetscBool row_identity, col_identity;
601: PetscInt n = A->rmap->n;
603: PetscFunctionBegin;
604: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
605: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
606: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
607: #else
608: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
609: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
610: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
611: #endif
613: cusparseTriFactors->nnz = a->nz;
615: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
616: /* lower triangular indices */
617: PetscCall(ISIdentity(isrow, &row_identity));
618: if (!row_identity && !cusparseTriFactors->rpermIndices) {
619: const PetscInt *r;
621: PetscCall(ISGetIndices(isrow, &r));
622: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
623: cusparseTriFactors->rpermIndices->assign(r, r + n);
624: PetscCall(ISRestoreIndices(isrow, &r));
625: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
626: }
628: /* upper triangular indices */
629: PetscCall(ISIdentity(isicol, &col_identity));
630: if (!col_identity && !cusparseTriFactors->cpermIndices) {
631: const PetscInt *c;
633: PetscCall(ISGetIndices(isicol, &c));
634: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
635: cusparseTriFactors->cpermIndices->assign(c, c + n);
636: PetscCall(ISRestoreIndices(isicol, &c));
637: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
638: }
639: PetscFunctionReturn(PETSC_SUCCESS);
640: }
642: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
643: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
644: {
645: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
646: PetscInt m = A->rmap->n;
647: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
648: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
649: const MatScalar *Aa = a->a;
650: PetscInt *Mj, Mnz;
651: PetscScalar *Ma, *D;
653: PetscFunctionBegin;
654: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
655: if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
656: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
657: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
658: Mnz = Ai[m]; // Unz (with the unit diagonal)
659: PetscCall(PetscMalloc1(Mnz, &Ma));
660: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
661: PetscCall(PetscMalloc1(m, &D)); // the diagonal
662: for (PetscInt i = 0; i < m; i++) {
663: PetscInt ulen = Ai[i + 1] - Ai[i];
664: Mj[Ai[i]] = i; // diagonal entry
665: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
666: }
667: // Copy M (U) from host to device
668: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
669: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
670: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
671: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
672: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
673: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
675: // Create the descriptor for U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
676: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
677: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
678: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
679: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
680: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
681: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
682: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
684: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
685: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
686: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
688: // Allocate work vectors in SpSv
689: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
690: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
692: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
693: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
695: // Query buffer sizes for SpSV and then allocate buffers
696: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
697: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
698: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
700: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
701: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
702: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
704: // Record for reuse
705: fs->csrVal_h = Ma;
706: fs->diag_h = D;
707: PetscCall(PetscFree(Mj));
708: }
709: // Copy the value
710: Ma = fs->csrVal_h;
711: D = fs->diag_h;
712: Mnz = Ai[m];
713: for (PetscInt i = 0; i < m; i++) {
714: D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
715: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
716: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
717: }
718: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
719: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
721: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
722: if (fs->updatedSpSVAnalysis) {
723: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
724: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
725: } else
726: #endif
727: {
728: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
729: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
730: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
731: fs->updatedSpSVAnalysis = PETSC_TRUE;
732: }
733: }
734: PetscFunctionReturn(PETSC_SUCCESS);
735: }
737: // Solve Ut D U x = b
738: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
739: {
740: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
741: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
742: const PetscScalar *barray;
743: PetscScalar *xarray;
744: thrust::device_ptr<const PetscScalar> bGPU;
745: thrust::device_ptr<PetscScalar> xGPU;
746: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
747: PetscInt m = A->rmap->n;
749: PetscFunctionBegin;
750: PetscCall(PetscLogGpuTimeBegin());
751: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
752: PetscCall(VecCUDAGetArrayRead(b, &barray));
753: xGPU = thrust::device_pointer_cast(xarray);
754: bGPU = thrust::device_pointer_cast(barray);
756: // Reorder b with the row permutation if needed, and wrap the result in fs->X
757: if (fs->rpermIndices) {
758: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
759: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
760: } else {
761: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
762: }
764: // Solve Ut Y = X
765: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
766: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
768: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
769: // It is basically a vector element-wise multiplication, but cublas does not have it!
770: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
772: // Solve U X = Y
773: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
774: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
775: } else {
776: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
777: }
778: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
780: // Reorder X with the column permutation if needed, and put the result back to x
781: if (fs->cpermIndices) {
782: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
783: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
784: }
786: PetscCall(VecCUDARestoreArrayRead(b, &barray));
787: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
788: PetscCall(PetscLogGpuTimeEnd());
789: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
790: PetscFunctionReturn(PETSC_SUCCESS);
791: }
792: #else
793: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
794: {
795: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
796: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
797: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
798: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
799: PetscInt *AiUp, *AjUp;
800: PetscScalar *AAUp;
801: PetscScalar *AALo;
802: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
803: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
804: const PetscInt *ai = b->i, *aj = b->j, *vj;
805: const MatScalar *aa = b->a, *v;
807: PetscFunctionBegin;
808: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
809: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
810: try {
811: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
812: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
813: if (!upTriFactor && !loTriFactor) {
814: /* Allocate Space for the upper triangular matrix */
815: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
816: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
818: /* Fill the upper triangular matrix */
819: AiUp[0] = (PetscInt)0;
820: AiUp[n] = nzUpper;
821: offset = 0;
822: for (i = 0; i < n; i++) {
823: /* set the pointers */
824: v = aa + ai[i];
825: vj = aj + ai[i];
826: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
828: /* first, set the diagonal elements */
829: AjUp[offset] = (PetscInt)i;
830: AAUp[offset] = (MatScalar)1.0 / v[nz];
831: AiUp[i] = offset;
832: AALo[offset] = (MatScalar)1.0 / v[nz];
834: offset += 1;
835: if (nz > 0) {
836: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
837: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
838: for (j = offset; j < offset + nz; j++) {
839: AAUp[j] = -AAUp[j];
840: AALo[j] = AAUp[j] / v[nz];
841: }
842: offset += nz;
843: }
844: }
846: /* allocate space for the triangular factor information */
847: PetscCall(PetscNew(&upTriFactor));
848: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
850: /* Create the matrix description */
851: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
852: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
853: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
854: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
855: #else
856: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
857: #endif
858: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
859: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
861: /* set the matrix */
862: upTriFactor->csrMat = new CsrMatrix;
863: upTriFactor->csrMat->num_rows = A->rmap->n;
864: upTriFactor->csrMat->num_cols = A->cmap->n;
865: upTriFactor->csrMat->num_entries = a->nz;
867: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
868: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
870: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
871: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
873: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
874: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
876: /* set the operation */
877: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
879: /* Create the solve analysis information */
880: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
881: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
882: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
883: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
884: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
885: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
886: #endif
888: /* perform the solve analysis */
889: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
890: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
892: PetscCallCUDA(WaitForCUDA());
893: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
895: /* assign the pointer */
896: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
898: /* allocate space for the triangular factor information */
899: PetscCall(PetscNew(&loTriFactor));
900: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
902: /* Create the matrix description */
903: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
904: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
905: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
906: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
907: #else
908: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
909: #endif
910: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
911: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
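// Note: the lower-triangular factor reuses the upper-triangular CSR pattern (AiUp/AjUp) and is applied
// with a transpose operation (set just below), hence the UPPER fill mode here.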
913: /* set the operation */
914: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
916: /* set the matrix */
917: loTriFactor->csrMat = new CsrMatrix;
918: loTriFactor->csrMat->num_rows = A->rmap->n;
919: loTriFactor->csrMat->num_cols = A->cmap->n;
920: loTriFactor->csrMat->num_entries = a->nz;
922: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
923: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
925: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
926: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
928: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
929: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
931: /* Create the solve analysis information */
932: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
933: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
934: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
935: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
936: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
937: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
938: #endif
940: /* perform the solve analysis */
941: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
942: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
944: PetscCallCUDA(WaitForCUDA());
945: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
947: /* assign the pointer */
948: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
950: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
951: PetscCallCUDA(cudaFreeHost(AiUp));
952: PetscCallCUDA(cudaFreeHost(AjUp));
953: } else {
954: /* Fill the upper triangular matrix */
955: offset = 0;
956: for (i = 0; i < n; i++) {
957: /* set the pointers */
958: v = aa + ai[i];
959: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
961: /* first, set the diagonal elements */
962: AAUp[offset] = 1.0 / v[nz];
963: AALo[offset] = 1.0 / v[nz];
965: offset += 1;
966: if (nz > 0) {
967: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
968: for (j = offset; j < offset + nz; j++) {
969: AAUp[j] = -AAUp[j];
970: AALo[j] = AAUp[j] / v[nz];
971: }
972: offset += nz;
973: }
974: }
975: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
976: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
977: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
978: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
979: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
980: }
981: PetscCallCUDA(cudaFreeHost(AAUp));
982: PetscCallCUDA(cudaFreeHost(AALo));
983: } catch (char *ex) {
984: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
985: }
986: }
987: PetscFunctionReturn(PETSC_SUCCESS);
988: }
989: #endif
991: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
992: {
993: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
994: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
995: IS ip = a->row;
996: PetscBool perm_identity;
997: PetscInt n = A->rmap->n;
999: PetscFunctionBegin;
1000: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
1002: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1003: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
1004: #else
1005: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
1006: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
1007: #endif
1008: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
1010: A->offloadmask = PETSC_OFFLOAD_BOTH;
1012: /* lower triangular indices */
1013: PetscCall(ISIdentity(ip, &perm_identity));
1014: if (!perm_identity) {
1015: IS iip;
1016: const PetscInt *irip, *rip;
1018: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
1019: PetscCall(ISGetIndices(iip, &irip));
1020: PetscCall(ISGetIndices(ip, &rip));
1021: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1022: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1023: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1024: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1025: PetscCall(ISRestoreIndices(iip, &irip));
1026: PetscCall(ISDestroy(&iip));
1027: PetscCall(ISRestoreIndices(ip, &rip));
1028: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1029: }
1030: PetscFunctionReturn(PETSC_SUCCESS);
1031: }
1033: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1034: {
1035: PetscFunctionBegin;
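// The numeric factorization is done on the CPU by MatCholeskyFactorNumeric_SeqAIJ(); the factors are
// then rebuilt/copied to the GPU by MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() below.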
1036: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1037: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1038: B->offloadmask = PETSC_OFFLOAD_CPU;
1040: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1041: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1042: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1043: #else
1044: /* determine which version of MatSolve needs to be used. */
1045: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1046: IS ip = b->row;
1047: PetscBool perm_identity;
1049: PetscCall(ISIdentity(ip, &perm_identity));
1050: if (perm_identity) {
1051: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1052: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1053: } else {
1054: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1055: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1056: }
1057: #endif
1058: B->ops->matsolve = NULL;
1059: B->ops->matsolvetranspose = NULL;
1061: /* get the triangular factors */
1062: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1063: PetscFunctionReturn(PETSC_SUCCESS);
1064: }
1066: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1067: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1068: {
1069: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1070: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1071: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1072: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1073: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1074: cusparseIndexBase_t indexBase;
1075: cusparseMatrixType_t matrixType;
1076: cusparseFillMode_t fillMode;
1077: cusparseDiagType_t diagType;
1079: PetscFunctionBegin;
1080: /* allocate space for the transpose of the lower triangular factor */
1081: PetscCall(PetscNew(&loTriFactorT));
1082: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1084: /* set the matrix descriptors of the lower triangular factor */
1085: matrixType = cusparseGetMatType(loTriFactor->descr);
1086: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1087: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1088: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1090: /* Create the matrix description */
1091: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1092: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1093: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1094: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1095: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1097: /* set the operation */
1098: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1100: /* allocate GPU space for the CSC of the lower triangular factor*/
1101: loTriFactorT->csrMat = new CsrMatrix;
1102: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1103: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1104: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1105: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1106: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1107: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1109: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1110: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1111: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1112: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1113: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1114: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1115: #endif
1117: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1118: {
1119: // there is no clean way to wrap this call with PetscCallCUSPARSE because of the version-dependent argument list...
1120: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1121: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1122: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1123: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1124: #else
1125: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1126: #endif
1127: PetscCallCUSPARSE(stat);
1128: }
1130: PetscCallCUDA(WaitForCUDA());
1131: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1133: /* Create the solve analysis information */
1134: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1135: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1136: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1137: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1138: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1139: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1140: #endif
1142: /* perform the solve analysis */
1143: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1144: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1146: PetscCallCUDA(WaitForCUDA());
1147: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1149: /* assign the pointer */
1150: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1152: /*********************************************/
1153: /* Now the Transpose of the Upper Tri Factor */
1154: /*********************************************/
1156: /* allocate space for the transpose of the upper triangular factor */
1157: PetscCall(PetscNew(&upTriFactorT));
1158: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1160: /* set the matrix descriptors of the upper triangular factor */
1161: matrixType = cusparseGetMatType(upTriFactor->descr);
1162: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1163: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1164: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1166: /* Create the matrix description */
1167: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1168: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1169: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1170: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1171: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1173: /* set the operation */
1174: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1176: /* allocate GPU space for the CSC of the upper triangular factor */
1177: upTriFactorT->csrMat = new CsrMatrix;
1178: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1179: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1180: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1181: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1182: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1183: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1185: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1186: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1187: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1188: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1189: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1190: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1191: #endif
1193: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1194: {
1195: // there is no clean way to wrap this call with PetscCallCUSPARSE because of the version-dependent argument list...
1196: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1197: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1198: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1199: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1200: #else
1201: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1202: #endif
1203: PetscCallCUSPARSE(stat);
1204: }
1206: PetscCallCUDA(WaitForCUDA());
1207: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1209: /* Create the solve analysis information */
1210: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1211: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1212: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1213: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1214: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1215: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1216: #endif
1218: /* perform the solve analysis (this repeats the lower-factor setup above and would be cleaner in a shared helper) */
1220: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1221: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1223: PetscCallCUDA(WaitForCUDA());
1224: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1226: /* assign the pointer */
1227: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1228: PetscFunctionReturn(PETSC_SUCCESS);
1229: }
1230: #endif
1232: struct PetscScalarToPetscInt {
1233: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1234: };
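/* PetscScalarToPetscInt recovers integer entry indices that were stored (exactly) in a PetscScalar array.
   MatSeqAIJCUSPARSEFormExplicitTranspose below pushes the sequence 0,1,...,nnz-1 through csr2csc once; the
   transposed "values" then give, for each entry of the transpose, the position of its source entry in A, and
   this functor converts them back to PetscInt so the permutation can be cached in csr2csc_i for cheap reuse. */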
1236: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1237: {
1238: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1239: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1240: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1241: cusparseStatus_t stat;
1242: cusparseIndexBase_t indexBase;
1244: PetscFunctionBegin;
1245: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1246: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1247: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1248: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1249: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1250: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1251: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1252: PetscCall(PetscLogGpuTimeBegin());
1253: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1254: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1255: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1256: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1257: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1259: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1261: /* set alpha and beta */
1262: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1263: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1264: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1265: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1266: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1267: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
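/* alpha_one, beta_zero and beta_one are device-resident copies of the scalar constants 1.0, 0.0 and 1.0,
   used as the alpha/beta scaling arguments of later cusparse calls on this transpose structure. */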
1269: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1270: CsrMatrix *matrixT = new CsrMatrix;
1271: matstructT->mat = matrixT;
1272: matrixT->num_rows = A->cmap->n;
1273: matrixT->num_cols = A->rmap->n;
1274: matrixT->num_entries = a->nz;
1275: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1276: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1277: matrixT->values = new THRUSTARRAY(a->nz);
1279: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1280: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1282: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1283: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1284: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1285: indexBase, cusparse_scalartype);
1286: PetscCallCUSPARSE(stat);
1287: #else
1288: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1289:    see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1291:    It is unclear what a proper value for matstructT->matDescr would be with empty matrices, so it is set
1292:    to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1293:    when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so it is set accordingly.
1294: */
1295: if (matrixT->num_entries) {
1296: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1297: PetscCallCUSPARSE(stat);
1299: } else {
1300: matstructT->matDescr = NULL;
1301: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1302: }
1303: #endif
1304: #endif
1305: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1306: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1307: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1308: #else
1309: CsrMatrix *temp = new CsrMatrix;
1310: CsrMatrix *tempT = new CsrMatrix;
1311: /* First convert HYB to CSR */
1312: temp->num_rows = A->rmap->n;
1313: temp->num_cols = A->cmap->n;
1314: temp->num_entries = a->nz;
1315: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1316: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1317: temp->values = new THRUSTARRAY(a->nz);
1319: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1320: PetscCallCUSPARSE(stat);
1322: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1323: tempT->num_rows = A->rmap->n;
1324: tempT->num_cols = A->cmap->n;
1325: tempT->num_entries = a->nz;
1326: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1327: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1328: tempT->values = new THRUSTARRAY(a->nz);
1330: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1331: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1332: PetscCallCUSPARSE(stat);
1334: /* Last, convert CSC to HYB */
1335: cusparseHybMat_t hybMat;
1336: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1337: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1338: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1339: PetscCallCUSPARSE(stat);
1341: /* assign the pointer */
1342: matstructT->mat = hybMat;
1343: A->transupdated = PETSC_TRUE;
1344: /* delete temporaries */
1345: if (tempT) {
1346: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1347: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1348: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1349: delete (CsrMatrix *)tempT;
1350: }
1351: if (temp) {
1352: if (temp->values) delete (THRUSTARRAY *)temp->values;
1353: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1354: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1355: delete (CsrMatrix *)temp;
1356: }
1357: #endif
1358: }
1359: }
1360: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1361: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1362: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1363: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1364: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1365: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1366: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1367: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1368: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1369: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1370: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1371: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1372: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1373: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1374: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1375: }
1376: if (!cusparsestruct->csr2csc_i) {
1377: THRUSTARRAY csr2csc_a(matrix->num_entries);
1378: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1380: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1381: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1382: void *csr2cscBuffer;
1383: size_t csr2cscBufferSize;
1384: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1385: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1386: PetscCallCUSPARSE(stat);
1387: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1388: #endif
1390: if (matrix->num_entries) {
1391: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1392: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1393: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1395: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1396: should be filled with indexBase. So I just take a shortcut here.
1397: */
1398: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1399: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1400: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1401: PetscCallCUSPARSE(stat);
1402: #else
1403: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1404: PetscCallCUSPARSE(stat);
1405: #endif
1406: } else {
1407: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1408: }
1410: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1411: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1412: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1413: PetscCallCUDA(cudaFree(csr2cscBuffer));
1414: #endif
1415: }
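/* With the permutation cached in csr2csc_i, refreshing the transposed values is a single gather from the CSR
   values array; no further csr2csc calls are needed when only the numerical values of A change. */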
1416: PetscCallThrust(
1417: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1418: }
1419: PetscCall(PetscLogGpuTimeEnd());
1420: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1421: /* the compressed row indices are not used for matTranspose */
1422: matstructT->cprowIndices = NULL;
1423: /* assign the pointer */
1424: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1425: A->transupdated = PETSC_TRUE;
1426: PetscFunctionReturn(PETSC_SUCCESS);
1427: }
1429: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1430: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1431: {
1432: const PetscScalar *barray;
1433: PetscScalar *xarray;
1434: thrust::device_ptr<const PetscScalar> bGPU;
1435: thrust::device_ptr<PetscScalar> xGPU;
1436: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1437: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1438: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1439: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1440: PetscInt m = A->rmap->n;
1442: PetscFunctionBegin;
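// Overall flow: apply the row permutation (if any) to b, do the lower-triangular solve L Y = X, then the
// upper-triangular solve U X = Y, and finally apply the column permutation (if any) to obtain x.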
1443: PetscCall(PetscLogGpuTimeBegin());
1444: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1445: PetscCall(VecCUDAGetArrayRead(b, &barray));
1446: xGPU = thrust::device_pointer_cast(xarray);
1447: bGPU = thrust::device_pointer_cast(barray);
1449: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1450: if (fs->rpermIndices) {
1451: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1452: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1453: } else {
1454: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1455: }
1457: // Solve L Y = X
1458: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1459: // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1460: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1462: // Solve U X = Y
1463: if (fs->cpermIndices) {
1464: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1465: } else {
1466: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1467: }
1468: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1470: // Reorder X with the column permutation if needed, and put the result back to x
1471: if (fs->cpermIndices) {
1472: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1473: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1474: }
1475: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1476: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1477: PetscCall(PetscLogGpuTimeEnd());
1478: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1479: PetscFunctionReturn(PETSC_SUCCESS);
1480: }
1482: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1483: {
1484: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1485: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1486: const PetscScalar *barray;
1487: PetscScalar *xarray;
1488: thrust::device_ptr<const PetscScalar> bGPU;
1489: thrust::device_ptr<PetscScalar> xGPU;
1490: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1491: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1492: PetscInt m = A->rmap->n;
1494: PetscFunctionBegin;
1495: PetscCall(PetscLogGpuTimeBegin());
1496: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1497: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1498: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1499: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1501: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1502: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1503: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1504: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1505: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1506: }
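// The transpose SpSV analysis depends on the current factor values, so it is (re)done lazily here whenever a
// numeric factorization has reset updatedTransposeSpSVAnalysis.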
1508: if (!fs->updatedTransposeSpSVAnalysis) {
1509: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1511: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1512: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1513: }
1515: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1516: PetscCall(VecCUDAGetArrayRead(b, &barray));
1517: xGPU = thrust::device_pointer_cast(xarray);
1518: bGPU = thrust::device_pointer_cast(barray);
1520: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1521: if (fs->rpermIndices) {
1522: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1523: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1524: } else {
1525: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1526: }
1528: // Solve Ut Y = X
1529: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1530: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1532: // Solve Lt X = Y
1533: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1534: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1535: } else {
1536: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1537: }
1538: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1540: // Reorder X with the column permutation if needed, and put the result back to x
1541: if (fs->cpermIndices) {
1542: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1543: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1544: }
1546: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1547: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1548: PetscCall(PetscLogGpuTimeEnd());
1549: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1550: PetscFunctionReturn(PETSC_SUCCESS);
1551: }
1552: #else
1553: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1554: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1555: {
1556: PetscInt n = xx->map->n;
1557: const PetscScalar *barray;
1558: PetscScalar *xarray;
1559: thrust::device_ptr<const PetscScalar> bGPU;
1560: thrust::device_ptr<PetscScalar> xGPU;
1561: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1562: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1563: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1564: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1566: PetscFunctionBegin;
1567: /* Analyze the matrix and create the transpose ... on the fly */
1568: if (!loTriFactorT && !upTriFactorT) {
1569: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1570: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1571: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1572: }
1574: /* Get the GPU pointers */
1575: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1576: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1577: xGPU = thrust::device_pointer_cast(xarray);
1578: bGPU = thrust::device_pointer_cast(barray);
1580: PetscCall(PetscLogGpuTimeBegin());
1581: /* First, reorder with the row permutation */
1582: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1584: /* First, solve U */
1585: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1586: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1588: /* Then, solve L */
1589: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1590: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1592: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1593: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1595: /* Copy the temporary to the full solution. */
1596: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1598: /* restore */
1599: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1600: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1601: PetscCall(PetscLogGpuTimeEnd());
1602: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1603: PetscFunctionReturn(PETSC_SUCCESS);
1604: }
1606: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1607: {
1608: const PetscScalar *barray;
1609: PetscScalar *xarray;
1610: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1611: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1612: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1613: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1615: PetscFunctionBegin;
1616: /* Analyze the matrix and create the transpose ... on the fly */
1617: if (!loTriFactorT && !upTriFactorT) {
1618: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1619: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1620: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1621: }
1623: /* Get the GPU pointers */
1624: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1625: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1627: PetscCall(PetscLogGpuTimeBegin());
1628: /* First, solve U */
1629: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1630: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1632: /* Then, solve L */
1633: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1634: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1636: /* restore */
1637: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1638: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1639: PetscCall(PetscLogGpuTimeEnd());
1640: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1641: PetscFunctionReturn(PETSC_SUCCESS);
1642: }
1644: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1645: {
1646: const PetscScalar *barray;
1647: PetscScalar *xarray;
1648: thrust::device_ptr<const PetscScalar> bGPU;
1649: thrust::device_ptr<PetscScalar> xGPU;
1650: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1651: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1652: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1653: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1655: PetscFunctionBegin;
1656: /* Get the GPU pointers */
1657: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1658: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1659: xGPU = thrust::device_pointer_cast(xarray);
1660: bGPU = thrust::device_pointer_cast(barray);
1662: PetscCall(PetscLogGpuTimeBegin());
1663: /* First, reorder with the row permutation */
1664: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1666: /* Next, solve L */
1667: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1668: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1670: /* Then, solve U */
1671: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1672: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1674: /* Last, reorder with the column permutation */
1675: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1677: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1678: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1679: PetscCall(PetscLogGpuTimeEnd());
1680: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1681: PetscFunctionReturn(PETSC_SUCCESS);
1682: }
1684: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1685: {
1686: const PetscScalar *barray;
1687: PetscScalar *xarray;
1688: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1689: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1690: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1691: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1693: PetscFunctionBegin;
1694: /* Get the GPU pointers */
1695: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1696: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1698: PetscCall(PetscLogGpuTimeBegin());
1699: /* First, solve L */
1700: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1701: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1703: /* Next, solve U */
1704: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1705: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1707: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1708: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1709: PetscCall(PetscLogGpuTimeEnd());
1710: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1711: PetscFunctionReturn(PETSC_SUCCESS);
1712: }
1713: #endif
1715: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1716: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1717: {
1718: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1719: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1720: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1721: CsrMatrix *Acsr;
1722: PetscInt m, nz;
1723: PetscBool flg;
1725: PetscFunctionBegin;
1726: if (PetscDefined(USE_DEBUG)) {
1727: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1728: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1729: }
1731: /* Copy A's value to fact */
1732: m = fact->rmap->n;
1733: nz = aij->nz;
1734: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1735: Acsr = (CsrMatrix *)Acusp->mat->mat;
1736: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
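/* ILU(0) keeps A's sparsity pattern, so only the value array needs refreshing here; the row pointers and
   column indices set up in the symbolic phase are reused, and the factorization below overwrites csrVal in place. */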
1738: PetscCall(PetscLogGpuTimeBegin());
1739: /* Factorize fact inplace */
1740: if (m)
1741: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1742: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1743: if (PetscDefined(USE_DEBUG)) {
1744: int numerical_zero;
1745: cusparseStatus_t status;
1746: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1747: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1748: }
1750: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1751: if (fs->updatedSpSVAnalysis) {
1752: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1753: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1754: } else
1755: #endif
1756: {
1757: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we do it after cusparseXcsrilu02().
1758: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1759: */
1760: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1762: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1764: fs->updatedSpSVAnalysis = PETSC_TRUE;
1765: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1766: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1767: }
1769: fact->offloadmask = PETSC_OFFLOAD_GPU;
1770: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1771: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1772: fact->ops->matsolve = NULL;
1773: fact->ops->matsolvetranspose = NULL;
1774: PetscCall(PetscLogGpuTimeEnd());
1775: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1776: PetscFunctionReturn(PETSC_SUCCESS);
1777: }
1779: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1780: {
1781: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1782: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1783: PetscInt m, nz;
1785: PetscFunctionBegin;
1786: if (PetscDefined(USE_DEBUG)) {
1787: PetscInt i;
1788: PetscBool flg, missing;
1790: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1791: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1792: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1793: PetscCall(MatMissingDiagonal(A, &missing, &i));
1794: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1795: }
1797: /* Free the old stale stuff */
1798: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1800: /* Copy over A's metadata to fact. Note that fact's i,j,a are also allocated on host,
1801:    but they will not be used; they are allocated only for easier debugging.
1802: */
1803: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1805: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1806: fact->factortype = MAT_FACTOR_ILU;
1807: fact->info.factor_mallocs = 0;
1808: fact->info.fill_ratio_given = info->fill;
1809: fact->info.fill_ratio_needed = 1.0;
1811: aij->row = NULL;
1812: aij->col = NULL;
1814: /* ====================================================================== */
1815: /* Copy A's i, j to fact and also allocate the value array of fact. */
1816: /* We'll do in-place factorization on fact */
1817: /* ====================================================================== */
1818: const int *Ai, *Aj;
1820: m = fact->rmap->n;
1821: nz = aij->nz;
1823: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1824: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1825: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1826: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1827: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1828: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1830: /* ====================================================================== */
1831: /* Create descriptors for M, L, U */
1832: /* ====================================================================== */
1833: cusparseFillMode_t fillMode;
1834: cusparseDiagType_t diagType;
1836: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1837: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1838: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1840: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1841: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1842: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1843: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1844: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1845: */
1846: fillMode = CUSPARSE_FILL_MODE_LOWER;
1847: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1848: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1849: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1850: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1852: fillMode = CUSPARSE_FILL_MODE_UPPER;
1853: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1854: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1855: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1856: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
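/* Note that spMatDescr_L and spMatDescr_U alias the same csrRowPtr32/csrColIdx32/csrVal arrays holding the
   combined factor M; the fill-mode and diag-type attributes tell cusparseSpSV which triangle to read, with
   L's unit diagonal implied rather than stored. */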
1858: /* ========================================================================= */
1859: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1860: /* ========================================================================= */
1861: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1862: if (m)
1863: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1864: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1866: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1867: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1869: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1870: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1872: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1873: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1875: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1876: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1878: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1879: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1880: spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
1881: To save memory, we share factBuffer_M with the bigger of spsvBuffer_L/U.
1882: */
1883: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1884: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1885: fs->spsvBuffer_L = fs->factBuffer_M;
1886: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1887: } else {
1888: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1889: fs->spsvBuffer_U = fs->factBuffer_M;
1890: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1891: }
1893: /* ========================================================================== */
1894: /* Perform analysis of ilu0 on M, SpSv on L and U */
1895: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1896: /* ========================================================================== */
1897: int structural_zero;
1898: cusparseStatus_t status;
1900: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1901: if (m)
1902: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1903: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1904: if (PetscDefined(USE_DEBUG)) {
1905: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1906: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1907: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1908: }
1910: /* Estimate FLOPs of the numeric factorization */
1911: {
1912: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1913: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1914: PetscLogDouble flops = 0.0;
1916: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1917: Ai = Aseq->i;
1918: Adiag = Aseq->diag;
1919: for (PetscInt i = 0; i < m; i++) {
1920: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* there are nonzeros to the left of the diagonal in row i */
1921: nzRow = Ai[i + 1] - Ai[i];
1922: nzLeft = Adiag[i] - Ai[i];
1923: /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination
1924:    updates the nonzeros to the right of, and including, the eliminated one, costing one multiplication and
1925:    one addition per updated entry.
1926: */
1927: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1928: }
1929: }
1930: fs->numericFactFlops = flops;
1931: }
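/* For illustration (not part of the estimate above): a row with nzRow = 5 entries and nzLeft = 2 entries to
   the left of the diagonal contributes 2 * (2*5 - 2 + 1) = 18 flops under this model. */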
1932: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1933: PetscFunctionReturn(PETSC_SUCCESS);
1934: }
1936: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1937: {
1938: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1939: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1940: const PetscScalar *barray;
1941: PetscScalar *xarray;
1943: PetscFunctionBegin;
1944: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1945: PetscCall(VecCUDAGetArrayRead(b, &barray));
1946: PetscCall(PetscLogGpuTimeBegin());
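/* ICC(0) stores only the lower factor, with A approximated by L*L^T; the solve is therefore a forward solve
   with L followed by a transpose solve with the same L descriptor (spsvDescr_Lt), and no separate U factor is
   involved. (A transpose rather than conjugate-transpose solve is used; see the note in
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 below.) */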
1948: /* Solve L*y = b */
1949: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1950: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1951: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1952: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1954: /* Solve Lt*x = y */
1955: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1956: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1957: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1959: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1960: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1962: PetscCall(PetscLogGpuTimeEnd());
1963: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1964: PetscFunctionReturn(PETSC_SUCCESS);
1965: }
1967: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1968: {
1969: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1970: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1971: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1972: CsrMatrix *Acsr;
1973: PetscInt m, nz;
1974: PetscBool flg;
1976: PetscFunctionBegin;
1977: if (PetscDefined(USE_DEBUG)) {
1978: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1979: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1980: }
1982: /* Copy A's value to fact */
1983: m = fact->rmap->n;
1984: nz = aij->nz;
1985: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1986: Acsr = (CsrMatrix *)Acusp->mat->mat;
1987: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1989: /* Factorize fact inplace */
1990: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1991: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1992: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1993: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1994: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1995: */
1996: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1997: if (PetscDefined(USE_DEBUG)) {
1998: int numerical_zero;
1999: cusparseStatus_t status;
2000: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2001: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
2002: }
2004: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
2005: if (fs->updatedSpSVAnalysis) {
2006: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2007: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
2008: } else
2009: #endif
2010: {
2011: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
2013: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2014: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2015: */
2016: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
2017: fs->updatedSpSVAnalysis = PETSC_TRUE;
2018: }
2020: fact->offloadmask = PETSC_OFFLOAD_GPU;
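  /* For ICC the factored matrix is (numerically) symmetric, so the transpose solve can reuse the same routine as the forward solve */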
2021: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
2022: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
2023: fact->ops->matsolve = NULL;
2024: fact->ops->matsolvetranspose = NULL;
2025: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2026: PetscFunctionReturn(PETSC_SUCCESS);
2027: }
2029: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2030: {
2031: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2032: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2033: PetscInt m, nz;
2035: PetscFunctionBegin;
2036: if (PetscDefined(USE_DEBUG)) {
2037: PetscInt i;
2038: PetscBool flg, missing;
2040: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2041: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2042: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2043: PetscCall(MatMissingDiagonal(A, &missing, &i));
2044: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2045: }
2047: /* Free the old stale stuff */
2048: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2050:   /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on the host,
2051:      but they will not be used; we allocate them only to ease debugging.
2052:   */
2053: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2055: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2056: fact->factortype = MAT_FACTOR_ICC;
2057: fact->info.factor_mallocs = 0;
2058: fact->info.fill_ratio_given = info->fill;
2059: fact->info.fill_ratio_needed = 1.0;
2061: aij->row = NULL;
2062: aij->col = NULL;
2064: /* ====================================================================== */
2065: /* Copy A's i, j to fact and also allocate the value array of fact. */
2066: /* We'll do in-place factorization on fact */
2067: /* ====================================================================== */
2068: const int *Ai, *Aj;
2070: m = fact->rmap->n;
2071: nz = aij->nz;
2073: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2074: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2075: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2076: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2077: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2078: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2080: /* ====================================================================== */
2081: /* Create mat descriptors for M, L */
2082: /* ====================================================================== */
2083: cusparseFillMode_t fillMode;
2084: cusparseDiagType_t diagType;
2086: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2087: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2088: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2090: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2091: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2092: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2093: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2094: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2095: */
2096: fillMode = CUSPARSE_FILL_MODE_LOWER;
2097: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2098: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2099: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2100: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2102: /* ========================================================================= */
2103: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2104: /* ========================================================================= */
2105: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2106: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2108: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2109: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2111: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2112: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2114: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2115: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2117: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2118: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2120:   /* To save device memory, we let the factorization buffer share storage with the larger of the two triangular-solve buffers.
2121:      See also the comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2122:   */
2123: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2124: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2125: fs->spsvBuffer_L = fs->factBuffer_M;
2126: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2127: } else {
2128: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2129: fs->spsvBuffer_Lt = fs->factBuffer_M;
2130: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2131: }
2133: /* ========================================================================== */
2134: /* Perform analysis of ic0 on M */
2135: /* The lower triangular part of M has the same sparsity pattern as L */
2136: /* ========================================================================== */
2137: int structural_zero;
2138: cusparseStatus_t status;
2140: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2141: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2142: if (PetscDefined(USE_DEBUG)) {
2143: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2144: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2145: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2146: }
2148: /* Estimate FLOPs of the numeric factorization */
2149: {
2150: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2151: PetscInt *Ai, nzRow, nzLeft;
2152: PetscLogDouble flops = 0.0;
2154: Ai = Aseq->i;
2155: for (PetscInt i = 0; i < m; i++) {
2156: nzRow = Ai[i + 1] - Ai[i];
2157: if (nzRow > 1) {
2158:         /* We eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination updates the nonzeros
2159:            to the right of, and including, the eliminated one, with each update costing a multiplication and an addition.
2160:         */
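        /* Worked example (hypothetical row): nzRow = 5 => nzLeft = (5 - 1) / 2 = 2, so the row contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops to the estimate */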
2161: nzLeft = (nzRow - 1) / 2;
2162: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2163: }
2164: }
2165: fs->numericFactFlops = flops;
2166: }
2167: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2168: PetscFunctionReturn(PETSC_SUCCESS);
2169: }
2170: #endif
2172: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2173: {
2174: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2175: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2177: PetscFunctionBegin;
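  /* The numeric LU factorization itself runs on the CPU (MatLUFactorNumeric_SeqAIJ); only the triangular solves
     are offloaded to the GPU, via MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() at the end of this routine */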
2178: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2179: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2180: B->offloadmask = PETSC_OFFLOAD_CPU;
2182: if (!cusparsestruct->use_cpu_solve) {
2183: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2184: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2185: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2186: #else
2187: /* determine which version of MatSolve needs to be used. */
2188: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2189: IS isrow = b->row, iscol = b->col;
2190: PetscBool row_identity, col_identity;
2192: PetscCall(ISIdentity(isrow, &row_identity));
2193: PetscCall(ISIdentity(iscol, &col_identity));
2194: if (row_identity && col_identity) {
2195: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2196: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2197: } else {
2198: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2199: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2200: }
2201: #endif
2202: }
2203: B->ops->matsolve = NULL;
2204: B->ops->matsolvetranspose = NULL;
2206: /* get the triangular factors */
2207: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2208: PetscFunctionReturn(PETSC_SUCCESS);
2209: }
2211: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2212: {
2213: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2215: PetscFunctionBegin;
2216: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2217: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2218: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2219: PetscFunctionReturn(PETSC_SUCCESS);
2220: }
2222: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2223: {
2224: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2226: PetscFunctionBegin;
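  /* With CUDA >= 11.4, take the dedicated ILU(0) path when no fill levels are requested, the row and column
     orderings are identity, and host factorization was not requested; otherwise fall back to the generic symbolic ILU */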
2227: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2228: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2229: if (!info->factoronhost) {
2230: PetscCall(ISIdentity(isrow, &row_identity));
2231: PetscCall(ISIdentity(iscol, &col_identity));
2232: }
2233: if (!info->levels && row_identity && col_identity) {
2234: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2235: } else
2236: #endif
2237: {
2238: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2239: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2240: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2241: }
2242: PetscFunctionReturn(PETSC_SUCCESS);
2243: }
2245: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2246: {
2247: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2249: PetscFunctionBegin;
2250: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2251: PetscBool perm_identity = PETSC_FALSE;
2252: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2253: if (!info->levels && perm_identity) {
2254: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2255: } else
2256: #endif
2257: {
2258: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2259: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2260: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2261: }
2262: PetscFunctionReturn(PETSC_SUCCESS);
2263: }
2265: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2266: {
2267: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2269: PetscFunctionBegin;
2270: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2271: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2272: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2273: PetscFunctionReturn(PETSC_SUCCESS);
2274: }
2276: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2277: {
2278: PetscFunctionBegin;
2279: *type = MATSOLVERCUSPARSE;
2280: PetscFunctionReturn(PETSC_SUCCESS);
2281: }
2283: /*MC
2284: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2285: of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2286: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2287: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2288: cuSPARSE triangular solve algorithm, but the performance can be quite poor, so these
2289: algorithms are not recommended. This class does NOT support direct solver operations.
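
A typical way to select this solver from the command line (a sketch, assuming the standard PETSc option names) is
.vb
  -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
.ve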
2291: Level: beginner
2293: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2294: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2295: M*/
2297: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2298: {
2299: PetscInt n = A->rmap->n;
2301: PetscFunctionBegin;
2302: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2303: PetscCall(MatSetSizes(*B, n, n, n, n));
2304: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2305: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2307: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2308: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2309: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2310: if (!A->boundtocpu) {
2311: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2312: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2313: } else {
2314: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2315: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2316: }
2317: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2318: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2319: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2320: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2321: if (!A->boundtocpu) {
2322: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2323: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2324: } else {
2325: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2326: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2327: }
2328: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2329: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2330: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2332: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2333: (*B)->canuseordering = PETSC_TRUE;
2334: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2335: PetscFunctionReturn(PETSC_SUCCESS);
2336: }
2338: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2339: {
2340: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2341: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2342: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2343: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2344: #endif
2346: PetscFunctionBegin;
2347: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2348: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2349: if (A->factortype == MAT_FACTOR_NONE) {
2350: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2351: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2352: }
2353: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2354: else if (fs->csrVal) {
2355: /* We have a factorized matrix on device and are able to copy it to host */
2356: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2357: }
2358: #endif
2359: else
2360: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2361: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2362: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2363: A->offloadmask = PETSC_OFFLOAD_BOTH;
2364: }
2365: PetscFunctionReturn(PETSC_SUCCESS);
2366: }
2368: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2369: {
2370: PetscFunctionBegin;
2371: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2372: *array = ((Mat_SeqAIJ *)A->data)->a;
2373: PetscFunctionReturn(PETSC_SUCCESS);
2374: }
2376: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2377: {
2378: PetscFunctionBegin;
2379: A->offloadmask = PETSC_OFFLOAD_CPU;
2380: *array = NULL;
2381: PetscFunctionReturn(PETSC_SUCCESS);
2382: }
2384: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2385: {
2386: PetscFunctionBegin;
2387: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2388: *array = ((Mat_SeqAIJ *)A->data)->a;
2389: PetscFunctionReturn(PETSC_SUCCESS);
2390: }
2392: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2393: {
2394: PetscFunctionBegin;
2395: *array = NULL;
2396: PetscFunctionReturn(PETSC_SUCCESS);
2397: }
2399: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2400: {
2401: PetscFunctionBegin;
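  /* Write-only access: the current values need not be copied from the GPU; the matching restore marks the host copy as authoritative */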
2402: *array = ((Mat_SeqAIJ *)A->data)->a;
2403: PetscFunctionReturn(PETSC_SUCCESS);
2404: }
2406: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2407: {
2408: PetscFunctionBegin;
2409: A->offloadmask = PETSC_OFFLOAD_CPU;
2410: *array = NULL;
2411: PetscFunctionReturn(PETSC_SUCCESS);
2412: }
2414: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2415: {
2416: Mat_SeqAIJCUSPARSE *cusp;
2417: CsrMatrix *matrix;
2419: PetscFunctionBegin;
2420: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2421: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2422: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2423: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2424: matrix = (CsrMatrix *)cusp->mat->mat;
2426: if (i) {
2427: #if !defined(PETSC_USE_64BIT_INDICES)
2428: *i = matrix->row_offsets->data().get();
2429: #else
2430:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2431: #endif
2432: }
2433: if (j) {
2434: #if !defined(PETSC_USE_64BIT_INDICES)
2435: *j = matrix->column_indices->data().get();
2436: #else
2437:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2438: #endif
2439: }
2440: if (a) *a = matrix->values->data().get();
2441: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2442: PetscFunctionReturn(PETSC_SUCCESS);
2443: }
2445: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2446: {
2447: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2448: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2449: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2450: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2451: cusparseStatus_t stat;
2452: PetscBool both = PETSC_TRUE;
2454: PetscFunctionBegin;
2455: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2456: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2457: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2458: CsrMatrix *matrix;
2459: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2461: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2462: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2463: matrix->values->assign(a->a, a->a + a->nz);
2464: PetscCallCUDA(WaitForCUDA());
2465: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2466: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2467: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2468: } else {
2469: PetscInt nnz;
2470: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2471: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2472: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2473: delete cusparsestruct->workVector;
2474: delete cusparsestruct->rowoffsets_gpu;
2475: cusparsestruct->workVector = NULL;
2476: cusparsestruct->rowoffsets_gpu = NULL;
2477: try {
2478: if (a->compressedrow.use) {
2479: m = a->compressedrow.nrows;
2480: ii = a->compressedrow.i;
2481: ridx = a->compressedrow.rindex;
2482: } else {
2483: m = A->rmap->n;
2484: ii = a->i;
2485: ridx = NULL;
2486: }
2487: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2488: if (!a->a) {
2489: nnz = ii[m];
2490: both = PETSC_FALSE;
2491: } else nnz = a->nz;
2492: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2494: /* create cusparse matrix */
2495: cusparsestruct->nrows = m;
2496: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2497: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2498: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2499: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2501: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2502: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2503: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2504: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2505: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2506: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2507: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2509: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2510: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2511: /* set the matrix */
2512: CsrMatrix *mat = new CsrMatrix;
2513: mat->num_rows = m;
2514: mat->num_cols = A->cmap->n;
2515: mat->num_entries = nnz;
2516: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2517: mat->row_offsets->assign(ii, ii + m + 1);
2519: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2520: mat->column_indices->assign(a->j, a->j + nnz);
2522: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2523: if (a->a) mat->values->assign(a->a, a->a + nnz);
2525: /* assign the pointer */
2526: matstruct->mat = mat;
2527: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2528: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2529: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2530: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2531: PetscCallCUSPARSE(stat);
2532: }
2533: #endif
2534: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2535: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2536: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2537: #else
2538: CsrMatrix *mat = new CsrMatrix;
2539: mat->num_rows = m;
2540: mat->num_cols = A->cmap->n;
2541: mat->num_entries = nnz;
2542: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2543: mat->row_offsets->assign(ii, ii + m + 1);
2545: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2546: mat->column_indices->assign(a->j, a->j + nnz);
2548: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2549: if (a->a) mat->values->assign(a->a, a->a + nnz);
2551: cusparseHybMat_t hybMat;
2552: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2553: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2554: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2555: PetscCallCUSPARSE(stat);
2556: /* assign the pointer */
2557: matstruct->mat = hybMat;
2559: if (mat) {
2560: if (mat->values) delete (THRUSTARRAY *)mat->values;
2561: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2562: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2563: delete (CsrMatrix *)mat;
2564: }
2565: #endif
2566: }
2568: /* assign the compressed row indices */
2569: if (a->compressedrow.use) {
2570: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2571: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2572: matstruct->cprowIndices->assign(ridx, ridx + m);
2573: tmp = m;
2574: } else {
2575: cusparsestruct->workVector = NULL;
2576: matstruct->cprowIndices = NULL;
2577: tmp = 0;
2578: }
2579: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2581: /* assign the pointer */
2582: cusparsestruct->mat = matstruct;
2583: } catch (char *ex) {
2584: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2585: }
2586: PetscCallCUDA(WaitForCUDA());
2587: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2588: cusparsestruct->nonzerostate = A->nonzerostate;
2589: }
2590: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2591: }
2592: PetscFunctionReturn(PETSC_SUCCESS);
2593: }
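
/* Thrust functors applied element-wise through zip iterators: add the first tuple entry into the second,
   copy the first entry into the second, and copy the second entry into the first, respectively */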
2595: struct VecCUDAPlusEquals {
2596: template <typename Tuple>
2597: __host__ __device__ void operator()(Tuple t)
2598: {
2599: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2600: }
2601: };
2603: struct VecCUDAEquals {
2604: template <typename Tuple>
2605: __host__ __device__ void operator()(Tuple t)
2606: {
2607: thrust::get<1>(t) = thrust::get<0>(t);
2608: }
2609: };
2611: struct VecCUDAEqualsReverse {
2612: template <typename Tuple>
2613: __host__ __device__ void operator()(Tuple t)
2614: {
2615: thrust::get<0>(t) = thrust::get<1>(t);
2616: }
2617: };
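
/* Scratch data attached to C->product for AIJCUSPARSE matrix-matrix products; reused between the symbolic and
   numeric phases and freed by MatDestroy_MatMatCusparse() */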
2619: struct MatMatCusparse {
2620: PetscBool cisdense;
2621: PetscScalar *Bt;
2622: Mat X;
2623: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2624: PetscLogDouble flops;
2625: CsrMatrix *Bcsr;
2627: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2628: cusparseSpMatDescr_t matSpBDescr;
2629: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2630: cusparseDnMatDescr_t matBDescr;
2631: cusparseDnMatDescr_t matCDescr;
2632:   PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2633: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2634: void *dBuffer4;
2635: void *dBuffer5;
2636: #endif
2637: size_t mmBufferSize;
2638: void *mmBuffer;
2639: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2640: cusparseSpGEMMDescr_t spgemmDesc;
2641: #endif
2642: };
2644: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2645: {
2646: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2648: PetscFunctionBegin;
2649: PetscCallCUDA(cudaFree(mmdata->Bt));
2650: delete mmdata->Bcsr;
2651: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2652: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2653: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2654: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2655: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2656: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2657: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2658: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2659: #endif
2660: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2661: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2662: #endif
2663: PetscCall(MatDestroy(&mmdata->X));
2664: PetscCall(PetscFree(data));
2665: PetscFunctionReturn(PETSC_SUCCESS);
2666: }
2668: #include <../src/mat/impls/dense/seq/dense.h>
2670: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2671: {
2672: Mat_Product *product = C->product;
2673: Mat A, B;
2674: PetscInt m, n, blda, clda;
2675: PetscBool flg, biscuda;
2676: Mat_SeqAIJCUSPARSE *cusp;
2677: cusparseStatus_t stat;
2678: cusparseOperation_t opA;
2679: const PetscScalar *barray;
2680: PetscScalar *carray;
2681: MatMatCusparse *mmdata;
2682: Mat_SeqAIJCUSPARSEMultStruct *mat;
2683: CsrMatrix *csrmat;
2685: PetscFunctionBegin;
2686: MatCheckProduct(C, 1);
2687: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2688: mmdata = (MatMatCusparse *)product->data;
2689: A = product->A;
2690: B = product->B;
2691: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2692: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2693:   /* Currently, CopyToGpu does not copy if the matrix is bound to the CPU.
2694:      Instead of silently accepting a wrong answer, we prefer to raise an error */
2695: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2696: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2697: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2698: switch (product->type) {
2699: case MATPRODUCT_AB:
2700: case MATPRODUCT_PtAP:
2701: mat = cusp->mat;
2702: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2703: m = A->rmap->n;
2704: n = B->cmap->n;
2705: break;
2706: case MATPRODUCT_AtB:
2707: if (!A->form_explicit_transpose) {
2708: mat = cusp->mat;
2709: opA = CUSPARSE_OPERATION_TRANSPOSE;
2710: } else {
2711: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2712: mat = cusp->matTranspose;
2713: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2714: }
2715: m = A->cmap->n;
2716: n = B->cmap->n;
2717: break;
2718: case MATPRODUCT_ABt:
2719: case MATPRODUCT_RARt:
2720: mat = cusp->mat;
2721: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2722: m = A->rmap->n;
2723: n = B->rmap->n;
2724: break;
2725: default:
2726: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2727: }
2728: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2729: csrmat = (CsrMatrix *)mat->mat;
2730: /* if the user passed a CPU matrix, copy the data to the GPU */
2731: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2732: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2733: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2735: PetscCall(MatDenseGetLDA(B, &blda));
2736: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2737: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2738: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2739: } else {
2740: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2741: PetscCall(MatDenseGetLDA(C, &clda));
2742: }
2744: PetscCall(PetscLogGpuTimeBegin());
2745: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2746: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2747: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2748: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2749: #else
2750: cusparseSpMatDescr_t &matADescr = mat->matDescr;
2751: #endif
2753: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2754: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2755: size_t mmBufferSize;
2756: if (mmdata->initialized && mmdata->Blda != blda) {
2757: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2758: mmdata->matBDescr = NULL;
2759: }
2760: if (!mmdata->matBDescr) {
2761: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2762: mmdata->Blda = blda;
2763: }
2765: if (mmdata->initialized && mmdata->Clda != clda) {
2766: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2767: mmdata->matCDescr = NULL;
2768: }
2769: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2770: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2771: mmdata->Clda = clda;
2772: }
2774: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2775: if (matADescr) {
2776:       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found that matADescr could not be reused; it could be a cuSPARSE bug
2777: matADescr = NULL;
2778: }
2779: #endif
2781: if (!matADescr) {
2782: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2783: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2784: PetscCallCUSPARSE(stat);
2785: }
2787: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2789: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2790: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2791: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2792: mmdata->mmBufferSize = mmBufferSize;
2793: }
2795: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2796: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2797: #endif
2799: mmdata->initialized = PETSC_TRUE;
2800: } else {
2801: /* to be safe, always update pointers of the mats */
2802: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2803: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2804: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2805: }
2807: /* do cusparseSpMM, which supports transpose on B */
2808: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2809: #else
2810: PetscInt k;
2811: /* cusparseXcsrmm does not support transpose on B */
2812: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2813: cublasHandle_t cublasv2handle;
2814: cublasStatus_t cerr;
2816: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2817: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2818: PetscCallCUBLAS(cerr);
2819: blda = B->cmap->n;
2820: k = B->cmap->n;
2821: } else {
2822: k = B->rmap->n;
2823: }
2825: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2826: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2827: PetscCallCUSPARSE(stat);
2828: #endif
2829: PetscCall(PetscLogGpuTimeEnd());
2830: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2831: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2832: if (product->type == MATPRODUCT_RARt) {
2833: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2834: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2835: } else if (product->type == MATPRODUCT_PtAP) {
2836: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2837: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2838: } else {
2839: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2840: }
2841: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2842: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2843: PetscFunctionReturn(PETSC_SUCCESS);
2844: }
2846: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2847: {
2848: Mat_Product *product = C->product;
2849: Mat A, B;
2850: PetscInt m, n;
2851: PetscBool cisdense, flg;
2852: MatMatCusparse *mmdata;
2853: Mat_SeqAIJCUSPARSE *cusp;
2855: PetscFunctionBegin;
2856: MatCheckProduct(C, 1);
2857: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2858: A = product->A;
2859: B = product->B;
2860: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2861: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2862: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2863: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2864: switch (product->type) {
2865: case MATPRODUCT_AB:
2866: m = A->rmap->n;
2867: n = B->cmap->n;
2868: PetscCall(MatSetBlockSizesFromMats(C, A, B));
2869: break;
2870: case MATPRODUCT_AtB:
2871: m = A->cmap->n;
2872: n = B->cmap->n;
2873: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2874: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2875: break;
2876: case MATPRODUCT_ABt:
2877: m = A->rmap->n;
2878: n = B->rmap->n;
2879: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2880: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2881: break;
2882: case MATPRODUCT_PtAP:
2883: m = B->cmap->n;
2884: n = B->cmap->n;
2885: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2886: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2887: break;
2888: case MATPRODUCT_RARt:
2889: m = B->rmap->n;
2890: n = B->rmap->n;
2891: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2892: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2893: break;
2894: default:
2895: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2896: }
2897: PetscCall(MatSetSizes(C, m, n, m, n));
2898:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2899: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2900: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2902: /* product data */
2903: PetscCall(PetscNew(&mmdata));
2904: mmdata->cisdense = cisdense;
2905: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2906:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2907: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2908: #endif
2909: /* for these products we need intermediate storage */
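  /* mmdata->X holds the intermediate product A*B^T (for RARt) or A*B (for PtAP); C is then obtained from a
     dense-dense product with B in the numeric phase */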
2910: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2911: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2912: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2913: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2914: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2915: } else {
2916: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2917: }
2918: }
2919: C->product->data = mmdata;
2920: C->product->destroy = MatDestroy_MatMatCusparse;
2922: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2923: PetscFunctionReturn(PETSC_SUCCESS);
2924: }
2926: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2927: {
2928: Mat_Product *product = C->product;
2929: Mat A, B;
2930: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2931: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2932: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2933: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2934: PetscBool flg;
2935: cusparseStatus_t stat;
2936: MatProductType ptype;
2937: MatMatCusparse *mmdata;
2938: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2939: cusparseSpMatDescr_t BmatSpDescr;
2940: #endif
2941: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2943: PetscFunctionBegin;
2944: MatCheckProduct(C, 1);
2945: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2946: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2947: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2948: mmdata = (MatMatCusparse *)C->product->data;
2949: A = product->A;
2950: B = product->B;
2951:   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2952: mmdata->reusesym = PETSC_FALSE;
2953: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2954: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2955: Cmat = Ccusp->mat;
2956: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2957: Ccsr = (CsrMatrix *)Cmat->mat;
2958: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2959: goto finalize;
2960: }
2961: if (!c->nz) goto finalize;
2962: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2963: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2964: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2965: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2966: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2967: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2968: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2969: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2970: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2971: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2972: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2973: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2974: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2975: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2977: ptype = product->type;
2978: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2979: ptype = MATPRODUCT_AB;
2980: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2981: }
2982: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2983: ptype = MATPRODUCT_AB;
2984: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2985: }
2986: switch (ptype) {
2987: case MATPRODUCT_AB:
2988: Amat = Acusp->mat;
2989: Bmat = Bcusp->mat;
2990: break;
2991: case MATPRODUCT_AtB:
2992: Amat = Acusp->matTranspose;
2993: Bmat = Bcusp->mat;
2994: break;
2995: case MATPRODUCT_ABt:
2996: Amat = Acusp->mat;
2997: Bmat = Bcusp->matTranspose;
2998: break;
2999: default:
3000: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3001: }
3002: Cmat = Ccusp->mat;
3003: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3004: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3005: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3006: Acsr = (CsrMatrix *)Amat->mat;
3007: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3008: Ccsr = (CsrMatrix *)Cmat->mat;
3009: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3010: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3011: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
3012: PetscCall(PetscLogGpuTimeBegin());
3013: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3014: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3015: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3016: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3017: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3018: PetscCallCUSPARSE(stat);
3019: #else
3020: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3021: PetscCallCUSPARSE(stat);
3022: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3023: PetscCallCUSPARSE(stat);
3024: #endif
3025: #else
3026: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3027: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3028: PetscCallCUSPARSE(stat);
3029: #endif
3030: PetscCall(PetscLogGpuFlops(mmdata->flops));
3031: PetscCallCUDA(WaitForCUDA());
3032: PetscCall(PetscLogGpuTimeEnd());
3033: C->offloadmask = PETSC_OFFLOAD_GPU;
3034: finalize:
3035: /* shorter version of MatAssemblyEnd_SeqAIJ */
3036: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3037: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3038: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3039: c->reallocs = 0;
3040: C->info.mallocs += 0;
3041: C->info.nz_unneeded = 0;
3042: C->assembled = C->was_assembled = PETSC_TRUE;
3043: C->num_ass++;
3044: PetscFunctionReturn(PETSC_SUCCESS);
3045: }
3047: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3048: {
3049: Mat_Product *product = C->product;
3050: Mat A, B;
3051: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3052: Mat_SeqAIJ *a, *b, *c;
3053: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3054: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3055: PetscInt i, j, m, n, k;
3056: PetscBool flg;
3057: cusparseStatus_t stat;
3058: MatProductType ptype;
3059: MatMatCusparse *mmdata;
3060: PetscLogDouble flops;
3061: PetscBool biscompressed, ciscompressed;
3062: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3063: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3064: cusparseSpMatDescr_t BmatSpDescr;
3065: #else
3066: int cnz;
3067: #endif
3068: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3070: PetscFunctionBegin;
3071: MatCheckProduct(C, 1);
3072: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3073: A = product->A;
3074: B = product->B;
3075: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3076: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3077: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3078: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3079: a = (Mat_SeqAIJ *)A->data;
3080: b = (Mat_SeqAIJ *)B->data;
3081: /* product data */
3082: PetscCall(PetscNew(&mmdata));
3083: C->product->data = mmdata;
3084: C->product->destroy = MatDestroy_MatMatCusparse;
3086: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3087: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3088: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3089: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3090: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3091: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3093: ptype = product->type;
3094: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3095: ptype = MATPRODUCT_AB;
3096: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3097: }
3098: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3099: ptype = MATPRODUCT_AB;
3100: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3101: }
3102: biscompressed = PETSC_FALSE;
3103: ciscompressed = PETSC_FALSE;
3104: switch (ptype) {
3105: case MATPRODUCT_AB:
3106: m = A->rmap->n;
3107: n = B->cmap->n;
3108: k = A->cmap->n;
3109: Amat = Acusp->mat;
3110: Bmat = Bcusp->mat;
3111: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3112: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3113: break;
3114: case MATPRODUCT_AtB:
3115: m = A->cmap->n;
3116: n = B->cmap->n;
3117: k = A->rmap->n;
3118: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3119: Amat = Acusp->matTranspose;
3120: Bmat = Bcusp->mat;
3121: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3122: break;
3123: case MATPRODUCT_ABt:
3124: m = A->rmap->n;
3125: n = B->rmap->n;
3126: k = A->cmap->n;
3127: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3128: Amat = Acusp->mat;
3129: Bmat = Bcusp->matTranspose;
3130: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3131: break;
3132: default:
3133: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3134: }
3136: /* create cusparse matrix */
3137: PetscCall(MatSetSizes(C, m, n, m, n));
3138: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3139: c = (Mat_SeqAIJ *)C->data;
3140: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3141: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3142: Ccsr = new CsrMatrix;
3144: c->compressedrow.use = ciscompressed;
3145:   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be too */
3146: c->compressedrow.nrows = a->compressedrow.nrows;
3147: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3148: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3149: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3150: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3151: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3152: } else {
3153: c->compressedrow.nrows = 0;
3154: c->compressedrow.i = NULL;
3155: c->compressedrow.rindex = NULL;
3156: Ccusp->workVector = NULL;
3157: Cmat->cprowIndices = NULL;
3158: }
3159: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3160: Ccusp->mat = Cmat;
3161: Ccusp->mat->mat = Ccsr;
3162: Ccsr->num_rows = Ccusp->nrows;
3163: Ccsr->num_cols = n;
3164: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3165: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3166: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3167: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3168: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3169: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3170: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3171: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3173: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3174:   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3175: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3176: c->nz = 0;
3177: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3178: Ccsr->values = new THRUSTARRAY(c->nz);
3179: goto finalizesym;
3180: }
3182: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3183: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3184: Acsr = (CsrMatrix *)Amat->mat;
3185: if (!biscompressed) {
3186: Bcsr = (CsrMatrix *)Bmat->mat;
3187: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3188: BmatSpDescr = Bmat->matDescr;
3189: #endif
3190: } else { /* we need to use row offsets for the full matrix */
3191: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3192: Bcsr = new CsrMatrix;
3193: Bcsr->num_rows = B->rmap->n;
3194: Bcsr->num_cols = cBcsr->num_cols;
3195: Bcsr->num_entries = cBcsr->num_entries;
3196: Bcsr->column_indices = cBcsr->column_indices;
3197: Bcsr->values = cBcsr->values;
3198: if (!Bcusp->rowoffsets_gpu) {
3199: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3200: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3201: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3202: }
3203: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3204: mmdata->Bcsr = Bcsr;
3205: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3206: if (Bcsr->num_rows && Bcsr->num_cols) {
3207: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3208: PetscCallCUSPARSE(stat);
3209: }
3210: BmatSpDescr = mmdata->matSpBDescr;
3211: #endif
3212: }
3213: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3214: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3215: /* precompute flops count */
3216: if (ptype == MATPRODUCT_AB) {
3217: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3218: const PetscInt st = a->i[i];
3219: const PetscInt en = a->i[i + 1];
3220: for (j = st; j < en; j++) {
3221: const PetscInt brow = a->j[j];
3222: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3223: }
3224: }
3225: } else if (ptype == MATPRODUCT_AtB) {
3226: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3227: const PetscInt anzi = a->i[i + 1] - a->i[i];
3228: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3229: flops += (2. * anzi) * bnzi;
3230: }
3231: } else { /* TODO */
3232: flops = 0.;
3233: }
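  /* In the notation of the loops above: for C = A*B the estimate is
       flops = sum_i sum_{j in row i of A} 2*nnz(row a->j[j] of B),
     and for C = A^T*B (where A and B have the same number of rows) it is
       flops = sum_i 2*nnz(row i of A)*nnz(row i of B). */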
3235: mmdata->flops = flops;
3236: PetscCall(PetscLogGpuTimeBegin());
3238: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3239: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3240: // cuda-12.2 requires non-null csrRowOffsets
3241: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3242: PetscCallCUSPARSE(stat);
3243: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3244: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3245: {
3246: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3247: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3248: */
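  /* The calls below follow the cusparseSpGEMMreuse workflow: workEstimation, nnz and copy are each
     called twice, first with a NULL buffer to query the required size and then with the allocated
     buffer to do the actual work; compute is called once and relies on dBuffer4/dBuffer5, which are
     kept in mmdata so the numeric phase can reuse them. */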
3249: void *dBuffer1 = NULL;
3250: void *dBuffer2 = NULL;
3251: void *dBuffer3 = NULL;
3252: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3253: size_t bufferSize1 = 0;
3254: size_t bufferSize2 = 0;
3255: size_t bufferSize3 = 0;
3256: size_t bufferSize4 = 0;
3257: size_t bufferSize5 = 0;
3259: /* query how many bytes (bufferSize1) of external memory are needed */
3260: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3261: PetscCallCUSPARSE(stat);
3262: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3263: /* inspect the matrices A and B to understand the memory requirement for the next step */
3264: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3265: PetscCallCUSPARSE(stat);
3267: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3268: PetscCallCUSPARSE(stat);
3269: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3270: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3271: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3272: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3273: PetscCallCUSPARSE(stat);
3274: PetscCallCUDA(cudaFree(dBuffer1));
3275: PetscCallCUDA(cudaFree(dBuffer2));
3277: /* get matrix C non-zero entries C_nnz1 */
3278: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3279: c->nz = (PetscInt)C_nnz1;
3280: /* allocate matrix C */
3281: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3282: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283: Ccsr->values = new THRUSTARRAY(c->nz);
3284: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285: /* update matC with the new pointers */
3286: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3287: PetscCallCUSPARSE(stat);
3289: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3290: PetscCallCUSPARSE(stat);
3291: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3292: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3293: PetscCallCUSPARSE(stat);
3294: PetscCallCUDA(cudaFree(dBuffer3));
3295: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3296: PetscCallCUSPARSE(stat);
3297: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3298: }
3299: #else
3300: size_t bufSize2;
3301: /* query how many bytes (bufSize2) of external memory are needed */
3302: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3303: PetscCallCUSPARSE(stat);
3304: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3305: /* inspect the matrices A and B to understand the memory requirement for the next step */
3306: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3307: PetscCallCUSPARSE(stat);
3308: /* query the buffer size needed by the compute step */
3309: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3310: PetscCallCUSPARSE(stat);
3311: /* Neither the CUSPARSE documentation nor the API is clear here:
3312: we need both buffers to perform the operations properly!
3313: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API,
3314: it only appears for the workEstimation calls, yet it seems to be needed in compute, so probably its address
3315: is stored in the descriptor! What a messy API... */
3316: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3317: /* compute the intermediate product of A * B */
3318: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3319: PetscCallCUSPARSE(stat);
3320: /* get matrix C non-zero entries C_nnz1 */
3321: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3322: c->nz = (PetscInt)C_nnz1;
3323: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3324: mmdata->mmBufferSize / 1024));
3325: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3326: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3327: Ccsr->values = new THRUSTARRAY(c->nz);
3328: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3329: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3330: PetscCallCUSPARSE(stat);
3331: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3332: PetscCallCUSPARSE(stat);
3333: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3334: #else
3335: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3336: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3337: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3338: PetscCallCUSPARSE(stat);
3339: c->nz = cnz;
3340: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3341: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342: Ccsr->values = new THRUSTARRAY(c->nz);
3343: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3345: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3346: /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3347: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
3348: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3349: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3350: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3351: PetscCallCUSPARSE(stat);
3352: #endif
3353: PetscCall(PetscLogGpuFlops(mmdata->flops));
3354: PetscCall(PetscLogGpuTimeEnd());
3355: finalizesym:
3356: c->free_a = PETSC_TRUE;
3357: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3358: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3359: c->free_ij = PETSC_TRUE;
3360: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3361: PetscInt *d_i = c->i;
3362: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3363: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3364: ii = *Ccsr->row_offsets;
3365: jj = *Ccsr->column_indices;
3366: if (ciscompressed) d_i = c->compressedrow.i;
3367: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3368: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3369: } else {
3370: PetscInt *d_i = c->i;
3371: if (ciscompressed) d_i = c->compressedrow.i;
3372: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3373: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3374: }
3375: if (ciscompressed) { /* need to expand host row offsets */
3376: PetscInt r = 0;
3377: c->i[0] = 0;
3378: for (k = 0; k < c->compressedrow.nrows; k++) {
3379: const PetscInt next = c->compressedrow.rindex[k];
3380: const PetscInt old = c->compressedrow.i[k];
3381: for (; r < next; r++) c->i[r + 1] = old;
3382: }
3383: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3384: }
3385: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3386: PetscCall(PetscMalloc1(m, &c->ilen));
3387: PetscCall(PetscMalloc1(m, &c->imax));
3388: c->maxnz = c->nz;
3389: c->nonzerorowcnt = 0;
3390: c->rmax = 0;
3391: for (k = 0; k < m; k++) {
3392: const PetscInt nn = c->i[k + 1] - c->i[k];
3393: c->ilen[k] = c->imax[k] = nn;
3394: c->nonzerorowcnt += (PetscInt)!!nn;
3395: c->rmax = PetscMax(c->rmax, nn);
3396: }
3397: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3398: PetscCall(PetscMalloc1(c->nz, &c->a));
3399: Ccsr->num_entries = c->nz;
3401: C->nonzerostate++;
3402: PetscCall(PetscLayoutSetUp(C->rmap));
3403: PetscCall(PetscLayoutSetUp(C->cmap));
3404: Ccusp->nonzerostate = C->nonzerostate;
3405: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3406: C->preallocated = PETSC_TRUE;
3407: C->assembled = PETSC_FALSE;
3408: C->was_assembled = PETSC_FALSE;
3409: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3410: mmdata->reusesym = PETSC_TRUE;
3411: C->offloadmask = PETSC_OFFLOAD_GPU;
3412: }
3413: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3414: PetscFunctionReturn(PETSC_SUCCESS);
3415: }
3417: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3419: /* handles sparse or dense B */
3420: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3421: {
3422: Mat_Product *product = mat->product;
3423: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3425: PetscFunctionBegin;
3426: MatCheckProduct(mat, 1);
3427: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3428: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3429: if (product->type == MATPRODUCT_ABC) {
3430: Ciscusp = PETSC_FALSE;
3431: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3432: }
3433: if (Biscusp && Ciscusp) { /* even on the GPU path, the user may still select the CPU backend via options */
3434: PetscBool usecpu = PETSC_FALSE;
3435: switch (product->type) {
3436: case MATPRODUCT_AB:
3437: if (product->api_user) {
3438: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3439: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3440: PetscOptionsEnd();
3441: } else {
3442: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3443: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3444: PetscOptionsEnd();
3445: }
3446: break;
3447: case MATPRODUCT_AtB:
3448: if (product->api_user) {
3449: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3450: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3451: PetscOptionsEnd();
3452: } else {
3453: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3454: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3455: PetscOptionsEnd();
3456: }
3457: break;
3458: case MATPRODUCT_PtAP:
3459: if (product->api_user) {
3460: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3461: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3462: PetscOptionsEnd();
3463: } else {
3464: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3465: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3466: PetscOptionsEnd();
3467: }
3468: break;
3469: case MATPRODUCT_RARt:
3470: if (product->api_user) {
3471: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3472: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3473: PetscOptionsEnd();
3474: } else {
3475: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3476: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3477: PetscOptionsEnd();
3478: }
3479: break;
3480: case MATPRODUCT_ABC:
3481: if (product->api_user) {
3482: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3483: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3484: PetscOptionsEnd();
3485: } else {
3486: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3487: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3488: PetscOptionsEnd();
3489: }
3490: break;
3491: default:
3492: break;
3493: }
3494: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3495: }
3496: /* dispatch */
3497: if (isdense) {
3498: switch (product->type) {
3499: case MATPRODUCT_AB:
3500: case MATPRODUCT_AtB:
3501: case MATPRODUCT_ABt:
3502: case MATPRODUCT_PtAP:
3503: case MATPRODUCT_RARt:
3504: if (product->A->boundtocpu) {
3505: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3506: } else {
3507: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3508: }
3509: break;
3510: case MATPRODUCT_ABC:
3511: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3512: break;
3513: default:
3514: break;
3515: }
3516: } else if (Biscusp && Ciscusp) {
3517: switch (product->type) {
3518: case MATPRODUCT_AB:
3519: case MATPRODUCT_AtB:
3520: case MATPRODUCT_ABt:
3521: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3522: break;
3523: case MATPRODUCT_PtAP:
3524: case MATPRODUCT_RARt:
3525: case MATPRODUCT_ABC:
3526: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3527: break;
3528: default:
3529: break;
3530: }
3531: } else { /* fallback for AIJ */
3532: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3533: }
3534: PetscFunctionReturn(PETSC_SUCCESS);
3535: }
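/* A minimal usage sketch of the backend selection above (the application binary name is hypothetical):
   running with
     ./app -matmatmult_backend_cpu
   makes a C = A*B product requested through the MatMatMult() API fall back to the CPU kernels, while
     ./app -mat_product_algorithm_backend_cpu
   does the same for products set up through MatProductCreate()/MatProductSetType(). */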
3537: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3538: {
3539: PetscFunctionBegin;
3540: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3541: PetscFunctionReturn(PETSC_SUCCESS);
3542: }
3544: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3545: {
3546: PetscFunctionBegin;
3547: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3548: PetscFunctionReturn(PETSC_SUCCESS);
3549: }
3551: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3552: {
3553: PetscFunctionBegin;
3554: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3555: PetscFunctionReturn(PETSC_SUCCESS);
3556: }
3558: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3559: {
3560: PetscFunctionBegin;
3561: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3562: PetscFunctionReturn(PETSC_SUCCESS);
3563: }
3565: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3566: {
3567: PetscFunctionBegin;
3568: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3569: PetscFunctionReturn(PETSC_SUCCESS);
3570: }
3572: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3573: {
3574: int i = blockIdx.x * blockDim.x + threadIdx.x;
3575: if (i < n) y[idx[i]] += x[i];
3576: }
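/* ScatterAdd is a one-thread-per-entry kernel: thread i adds the i-th entry of the (compressed) work
   vector x into y at position idx[i]. It is launched below with 256-thread blocks on PETSc's default
   CUDA stream, e.g. ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, idx, x, y). */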
3578: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3579: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3580: {
3581: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3582: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3583: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3584: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3585: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3586: PetscBool compressed;
3587: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3588: PetscInt nx, ny;
3589: #endif
3591: PetscFunctionBegin;
3592: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3593: if (!a->nz) {
3594: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3595: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3596: PetscFunctionReturn(PETSC_SUCCESS);
3597: }
3598: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3599: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3600: if (!trans) {
3601: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3602: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3603: } else {
3604: if (herm || !A->form_explicit_transpose) {
3605: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3606: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3607: } else {
3608: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3609: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3610: }
3611: }
3612: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3613: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3615: try {
3616: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3617: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3618: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3620: PetscCall(PetscLogGpuTimeBegin());
3621: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3622: /* z = A x + beta y.
3623: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3624: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3625: */
3626: xptr = xarray;
3627: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3628: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3629: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3630: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3631: allocated to accommodate different uses. So we get the length info directly from mat.
3632: */
3633: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3634: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3635: nx = mat->num_cols; // since y = Ax
3636: ny = mat->num_rows;
3637: }
3638: #endif
3639: } else {
3640: /* z = A^T x + beta y
3641: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3642: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3643: */
3644: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3645: dptr = zarray;
3646: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3647: if (compressed) { /* Scatter x to work vector */
3648: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3650: thrust::for_each(
3651: #if PetscDefined(HAVE_THRUST_ASYNC)
3652: thrust::cuda::par.on(PetscDefaultCudaStream),
3653: #endif
3654: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3655: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3656: }
3657: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3658: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3659: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3660: nx = mat->num_rows; // since y = A^T x
3661: ny = mat->num_cols;
3662: }
3663: #endif
3664: }
3666: /* csr_spmv does y = alpha op(A) x + beta y */
3667: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3668: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3669: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3670: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3671: #else
3672: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3673: #endif
3675: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3676: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3677: if (!matDescr) {
3678: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3679: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3680: }
3681: #endif
3683: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3684: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3685: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3686: PetscCallCUSPARSE(
3687: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3688: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3689: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3690: PetscCallCUSPARSE(
3691: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3692: #endif
3693: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3694: } else {
3695: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3696: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3697: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3698: }
3700: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3701: #else
3702: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3703: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3704: #endif
3705: } else {
3706: if (cusparsestruct->nrows) {
3707: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3708: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3709: #else
3710: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3711: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3712: #endif
3713: }
3714: }
3715: PetscCall(PetscLogGpuTimeEnd());
3717: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3718: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3719: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3720: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3721: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3722: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3723: }
3724: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3725: PetscCall(VecSeq_CUDA::Set(zz, 0));
3726: }
3728: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3729: if (compressed) {
3730: PetscCall(PetscLogGpuTimeBegin());
3731: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3732: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3733: PetscCall(PetscLogGpuTimeEnd());
3734: }
3735: } else {
3736: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3737: }
3738: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3739: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3740: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3741: } catch (char *ex) {
3742: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3743: }
3744: if (yy) {
3745: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3746: } else {
3747: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3748: }
3749: PetscFunctionReturn(PETSC_SUCCESS);
3750: }
3752: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3753: {
3754: PetscFunctionBegin;
3755: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3756: PetscFunctionReturn(PETSC_SUCCESS);
3757: }
3759: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3760: {
3761: PetscFunctionBegin;
3762: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3763: PetscFunctionReturn(PETSC_SUCCESS);
3764: }
3766: /*@
3767: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3768: (the default parallel PETSc format).
3770: Collective
3772: Input Parameters:
3773: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3774: . m - number of rows
3775: . n - number of columns
3776: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3777: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3779: Output Parameter:
3780: . A - the matrix
3782: Level: intermediate
3784: Notes:
3785: This matrix will ultimately be pushed down to NVIDIA GPUs and will use the CuSPARSE library for
3786: calculations. For good matrix assembly performance the user should preallocate the matrix
3787: storage by setting the parameter `nz` (or the array `nnz`).
3789: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3790: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3791: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3793: The AIJ format, also called
3794: compressed row storage, is fully compatible with standard Fortran
3795: storage. That is, the stored row and column indices can begin at
3796: either one (as in Fortran) or zero.
3798: Specify the preallocated storage with either nz or nnz (not both).
3799: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3800: allocation.
3802: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3803: @*/
3804: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3805: {
3806: PetscFunctionBegin;
3807: PetscCall(MatCreate(comm, A));
3808: PetscCall(MatSetSizes(*A, m, n, m, n));
3809: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3810: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3811: PetscFunctionReturn(PETSC_SUCCESS);
3812: }
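/* A minimal usage sketch (error handling and the sizes m, n, the index/value variables, and the vectors
   x, y are assumed to be provided by the caller):

     Mat A;
     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, m, n, 5, NULL, &A));
     PetscCall(MatSetValues(A, 1, &row, 1, &col, &val, INSERT_VALUES));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatMult(A, x, y));   // dispatched to cuSPARSE on the GPU
     PetscCall(MatDestroy(&A));

   or, following the recommended paradigm mentioned above:

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, n, m, n));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL)); */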
3814: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3815: {
3816: PetscFunctionBegin;
3817: if (A->factortype == MAT_FACTOR_NONE) {
3818: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3819: } else {
3820: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3821: }
3822: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3823: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3824: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3825: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3826: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3827: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3828: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3829: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3830: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3831: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3832: PetscCall(MatDestroy_SeqAIJ(A));
3833: PetscFunctionReturn(PETSC_SUCCESS);
3834: }
3836: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3837: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3838: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3839: {
3840: PetscFunctionBegin;
3841: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3842: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3843: PetscFunctionReturn(PETSC_SUCCESS);
3844: }
3846: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3847: {
3848: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3849: Mat_SeqAIJCUSPARSE *cy;
3850: Mat_SeqAIJCUSPARSE *cx;
3851: PetscScalar *ay;
3852: const PetscScalar *ax;
3853: CsrMatrix *csry, *csrx;
3855: PetscFunctionBegin;
3856: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3857: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3858: if (X->ops->axpy != Y->ops->axpy) {
3859: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3860: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3861: PetscFunctionReturn(PETSC_SUCCESS);
3862: }
3863: /* if we are here, it means both matrices are bound to GPU */
3864: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3865: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3866: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3867: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3868: csry = (CsrMatrix *)cy->mat->mat;
3869: csrx = (CsrMatrix *)cx->mat->mat;
3870: /* see if we can turn this into a cublas axpy */
3871: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3872: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3873: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3874: if (eq) str = SAME_NONZERO_PATTERN;
3875: }
3876: /* spgeam is buggy with one column */
3877: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3879: if (str == SUBSET_NONZERO_PATTERN) {
3880: PetscScalar b = 1.0;
3881: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3882: size_t bufferSize;
3883: void *buffer;
3884: #endif
3886: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3887: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3888: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3889: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3890: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3891: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3892: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3893: PetscCall(PetscLogGpuTimeBegin());
3894: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3895: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3896: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3897: PetscCall(PetscLogGpuTimeEnd());
3898: PetscCallCUDA(cudaFree(buffer));
3899: #else
3900: PetscCall(PetscLogGpuTimeBegin());
3901: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3902: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3903: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3904: PetscCall(PetscLogGpuTimeEnd());
3905: #endif
3906: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3907: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3908: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3909: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3910: } else if (str == SAME_NONZERO_PATTERN) {
3911: cublasHandle_t cublasv2handle;
3912: PetscBLASInt one = 1, bnz = 1;
3914: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3915: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3916: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3917: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3918: PetscCall(PetscLogGpuTimeBegin());
3919: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3920: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3921: PetscCall(PetscLogGpuTimeEnd());
3922: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3923: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3924: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3925: } else {
3926: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3927: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3928: }
3929: PetscFunctionReturn(PETSC_SUCCESS);
3930: }
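/* Summary of the MatAXPY paths above: SUBSET_NONZERO_PATTERN goes through cusparse csrgeam,
   SAME_NONZERO_PATTERN reduces to a cublas axpy on the value arrays, and anything else falls back to the
   CPU MatAXPY_SeqAIJ. For example, Y += 2*X with identical sparsity patterns would be
     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN)); */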
3932: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3933: {
3934: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3935: PetscScalar *ay;
3936: cublasHandle_t cublasv2handle;
3937: PetscBLASInt one = 1, bnz = 1;
3939: PetscFunctionBegin;
3940: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3941: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3942: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3943: PetscCall(PetscLogGpuTimeBegin());
3944: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3945: PetscCall(PetscLogGpuFlops(bnz));
3946: PetscCall(PetscLogGpuTimeEnd());
3947: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3948: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3949: PetscFunctionReturn(PETSC_SUCCESS);
3950: }
3952: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3953: {
3954: PetscBool both = PETSC_FALSE;
3955: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3957: PetscFunctionBegin;
3958: if (A->factortype == MAT_FACTOR_NONE) {
3959: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3960: if (spptr->mat) {
3961: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3962: if (matrix->values) {
3963: both = PETSC_TRUE;
3964: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3965: }
3966: }
3967: if (spptr->matTranspose) {
3968: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3969: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3970: }
3971: }
3972: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3973: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3974: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3975: else A->offloadmask = PETSC_OFFLOAD_CPU;
3976: PetscFunctionReturn(PETSC_SUCCESS);
3977: }
3979: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3980: {
3981: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3983: PetscFunctionBegin;
3984: if (A->factortype != MAT_FACTOR_NONE) {
3985: A->boundtocpu = flg;
3986: PetscFunctionReturn(PETSC_SUCCESS);
3987: }
3988: if (flg) {
3989: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3991: A->ops->scale = MatScale_SeqAIJ;
3992: A->ops->axpy = MatAXPY_SeqAIJ;
3993: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3994: A->ops->mult = MatMult_SeqAIJ;
3995: A->ops->multadd = MatMultAdd_SeqAIJ;
3996: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3997: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3998: A->ops->multhermitiantranspose = NULL;
3999: A->ops->multhermitiantransposeadd = NULL;
4000: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
4001: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4002: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4003: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4004: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4005: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4006: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4007: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4008: } else {
4009: A->ops->scale = MatScale_SeqAIJCUSPARSE;
4010: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
4011: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
4012: A->ops->mult = MatMult_SeqAIJCUSPARSE;
4013: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
4014: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
4015: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
4016: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4017: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4018: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
4019: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4020: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4021: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4022: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4023: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4024: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4025: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4027: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4028: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4029: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4030: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4031: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4032: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4033: }
4034: A->boundtocpu = flg;
4035: if (flg && a->inode.size_csr) {
4036: a->inode.use = PETSC_TRUE;
4037: } else {
4038: a->inode.use = PETSC_FALSE;
4039: }
4040: PetscFunctionReturn(PETSC_SUCCESS);
4041: }
4043: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4044: {
4045: Mat B;
4047: PetscFunctionBegin;
4048: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4049: if (reuse == MAT_INITIAL_MATRIX) {
4050: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4051: } else if (reuse == MAT_REUSE_MATRIX) {
4052: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4053: }
4054: B = *newmat;
4056: PetscCall(PetscFree(B->defaultvectype));
4057: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4059: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4060: if (B->factortype == MAT_FACTOR_NONE) {
4061: Mat_SeqAIJCUSPARSE *spptr;
4062: PetscCall(PetscNew(&spptr));
4063: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4064: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4065: spptr->format = MAT_CUSPARSE_CSR;
4066: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4067: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4068: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4069: #else
4070: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4071: #endif
4072: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4073: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4074: #endif
4075: B->spptr = spptr;
4076: } else {
4077: Mat_SeqAIJCUSPARSETriFactors *spptr;
4079: PetscCall(PetscNew(&spptr));
4080: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4081: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4082: B->spptr = spptr;
4083: }
4084: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4085: }
4086: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4087: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4088: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4089: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4090: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4091: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4093: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4094: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4095: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4096: #if defined(PETSC_HAVE_HYPRE)
4097: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4098: #endif
4099: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4100: PetscFunctionReturn(PETSC_SUCCESS);
4101: }
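/* A minimal conversion sketch for an existing, assembled MATSEQAIJ matrix A:
     PetscCall(MatConvert(A, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
   after which MatMult() and friends dispatch to the cuSPARSE implementations installed by
   MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE) above. */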
4103: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4104: {
4105: PetscFunctionBegin;
4106: PetscCall(MatCreate_SeqAIJ(B));
4107: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4108: PetscFunctionReturn(PETSC_SUCCESS);
4109: }
4111: /*MC
4112: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4114: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4115: CSR, ELL, or Hybrid format (ELL and Hybrid are only available with CUDA versions before 11.0).
4116: All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4118: Options Database Keys:
4119: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4120: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4121: Other options include ell (ellpack) or hyb (hybrid).
4122: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4123: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4125: Level: beginner
4127: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4128: M*/
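/* A minimal command-line sketch (the application binary name is hypothetical): an existing MATSEQAIJ code
   can be moved to the GPU with
     ./app -mat_type aijcusparse
   and, if desired, the triangular solves of the cuSPARSE factorizations can be kept on the CPU with
     ./app -mat_type aijcusparse -mat_cusparse_use_cpu_solve */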
4130: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4131: {
4132: PetscFunctionBegin;
4133: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4134: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4135: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4136: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4137: PetscFunctionReturn(PETSC_SUCCESS);
4138: }
4140: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4141: {
4142: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4144: PetscFunctionBegin;
4145: if (cusp) {
4146: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4147: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4148: delete cusp->workVector;
4149: delete cusp->rowoffsets_gpu;
4150: delete cusp->csr2csc_i;
4151: delete cusp->coords;
4152: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4153: PetscCall(PetscFree(mat->spptr));
4154: }
4155: PetscFunctionReturn(PETSC_SUCCESS);
4156: }
4158: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4159: {
4160: PetscFunctionBegin;
4161: if (*mat) {
4162: delete (*mat)->values;
4163: delete (*mat)->column_indices;
4164: delete (*mat)->row_offsets;
4165: delete *mat;
4166: *mat = 0;
4167: }
4168: PetscFunctionReturn(PETSC_SUCCESS);
4169: }
4171: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4172: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4173: {
4174: PetscFunctionBegin;
4175: if (*trifactor) {
4176: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4177: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4178: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4179: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4180: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4181: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4182: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4183: #endif
4184: PetscCall(PetscFree(*trifactor));
4185: }
4186: PetscFunctionReturn(PETSC_SUCCESS);
4187: }
4188: #endif
4190: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4191: {
4192: CsrMatrix *mat;
4194: PetscFunctionBegin;
4195: if (*matstruct) {
4196: if ((*matstruct)->mat) {
4197: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4198: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4199: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4200: #else
4201: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4202: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4203: #endif
4204: } else {
4205: mat = (CsrMatrix *)(*matstruct)->mat;
4206: PetscCall(CsrMatrix_Destroy(&mat));
4207: }
4208: }
4209: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4210: delete (*matstruct)->cprowIndices;
4211: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4212: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4213: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4215: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4216: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4217: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4219: for (int i = 0; i < 3; i++) {
4220: if (mdata->cuSpMV[i].initialized) {
4221: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4222: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4223: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4224: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4225: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4226: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4227: #endif
4228: }
4229: }
4230: #endif
4231: delete *matstruct;
4232: *matstruct = NULL;
4233: }
4234: PetscFunctionReturn(PETSC_SUCCESS);
4235: }
4237: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4238: {
4239: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4241: PetscFunctionBegin;
4242: if (fs) {
4243: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4244: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4245: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4246: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4247: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4248: delete fs->workVector;
4249: fs->workVector = NULL;
4250: #endif
4251: delete fs->rpermIndices;
4252: delete fs->cpermIndices;
4253: fs->rpermIndices = NULL;
4254: fs->cpermIndices = NULL;
4255: fs->init_dev_prop = PETSC_FALSE;
4256: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4257: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4258: PetscCallCUDA(cudaFree(fs->csrColIdx));
4259: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4260: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4261: PetscCallCUDA(cudaFree(fs->csrVal));
4262: PetscCallCUDA(cudaFree(fs->diag));
4263: PetscCallCUDA(cudaFree(fs->X));
4264: PetscCallCUDA(cudaFree(fs->Y));
4265: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares memory with one of spsvBuffer_L/U */
4266: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4267: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4268: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4269: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4270: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4271: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4272: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4273: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4274: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4275: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4276: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4277: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4278: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4279: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4280: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4281: PetscCall(PetscFree(fs->csrRowPtr_h));
4282: PetscCall(PetscFree(fs->csrVal_h));
4283: PetscCall(PetscFree(fs->diag_h));
4284: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4285: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4286: #endif
4287: }
4288: PetscFunctionReturn(PETSC_SUCCESS);
4289: }
4291: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4292: {
4293: PetscFunctionBegin;
4294: if (*trifactors) {
4295: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4296: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4297: PetscCall(PetscFree(*trifactors));
4298: }
4299: PetscFunctionReturn(PETSC_SUCCESS);
4300: }
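/* Functor comparing (row, column) index tuples lexicographically, first by row and then by column,
   for use as a comparison predicate in thrust algorithms over zipped (i,j) iterators. */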
4302: struct IJCompare {
4303: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4304: {
4305: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4306: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4307: return false;
4308: }
4309: };
4311: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4312: {
4313: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4315: PetscFunctionBegin;
4316: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4317: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4318: if (destroy) {
4319: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4320: delete cusp->csr2csc_i;
4321: cusp->csr2csc_i = NULL;
4322: }
4323: A->transupdated = PETSC_FALSE;
4324: PetscFunctionReturn(PETSC_SUCCESS);
4325: }
4327: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4328: {
4329: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4331: PetscFunctionBegin;
4332: PetscCallCUDA(cudaFree(coo->perm));
4333: PetscCallCUDA(cudaFree(coo->jmap));
4334: PetscCall(PetscFree(coo));
4335: PetscFunctionReturn(PETSC_SUCCESS);
4336: }
4338: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4339: {
4340: PetscBool dev_ij = PETSC_FALSE;
4341: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4342: PetscInt *i, *j;
4343: PetscContainer container_h;
4344: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4346: PetscFunctionBegin;
4347: PetscCall(PetscGetMemType(coo_i, &mtype));
4348: if (PetscMemTypeDevice(mtype)) {
4349: dev_ij = PETSC_TRUE;
4350: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4351: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4352: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4353: } else {
4354: i = coo_i;
4355: j = coo_j;
4356: }
4358: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4359: if (dev_ij) PetscCall(PetscFree2(i, j));
4360: mat->offloadmask = PETSC_OFFLOAD_CPU;
4361: // Create the GPU memory
4362: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4364: // Copy the COO struct to device
4365: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4366: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4367: PetscCall(PetscMalloc1(1, &coo_d));
4368: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4369: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4370: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4371: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4372: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4374: // Put the COO struct in a container and then attach that to the matrix
4375: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4376: PetscFunctionReturn(PETSC_SUCCESS);
4377: }
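/* Device kernel used by MatSetValuesCOO_SeqAIJCUSPARSE() below: a grid-stride loop in which each
   thread accumulates, for one stored nonzero a[i], the user-provided COO values kv[perm[k]] with
   jmap[i] <= k < jmap[i+1], and then inserts or adds the sum according to imode. */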
4379: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4380: {
4381: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4382: const PetscCount grid_size = gridDim.x * blockDim.x;
4383: for (; i < nnz; i += grid_size) {
4384: PetscScalar sum = 0.0;
4385: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4386: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4387: }
4388: }
4390: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4391: {
4392: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4393: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4394: PetscCount Annz = seq->nz;
4395: PetscMemType memtype;
4396: const PetscScalar *v1 = v;
4397: PetscScalar *Aa;
4398: PetscContainer container;
4399: MatCOOStruct_SeqAIJ *coo;
4401: PetscFunctionBegin;
4402: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4404: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4405: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4407: PetscCall(PetscGetMemType(v, &memtype));
4408: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4409: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4410: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4411: }
4413: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4414: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4416: PetscCall(PetscLogGpuTimeBegin());
4417: if (Annz) {
4418: MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4419: PetscCallCUDA(cudaPeekAtLastError());
4420: }
4421: PetscCall(PetscLogGpuTimeEnd());
4423: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4424: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4426: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4427: PetscFunctionReturn(PETSC_SUCCESS);
4428: }
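/* A minimal usage sketch (editorial, not part of the original source): assembling a 2x2
   MATSEQAIJCUSPARSE matrix through the COO interface implemented above. Passing host values
   exercises the host-to-device copy branch in MatSetValuesCOO_SeqAIJCUSPARSE().

     Mat         A;
     PetscInt    coo_i[] = {0, 0, 1, 1}, coo_j[] = {0, 1, 0, 1};
     PetscScalar coo_v[] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, coo_v, INSERT_VALUES));
     PetscCall(MatDestroy(&A));
*/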
4430: /*@C
4431: MatSeqAIJCUSPARSEGetIJ - returns the device CSR row pointers `i` and column indices `j` for `MATSEQAIJCUSPARSE` matrices.
4433: Not Collective
4435: Input Parameters:
4436: + A - the matrix
4437: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4439: Output Parameters:
4440: + i - the CSR row pointers
4441: - j - the CSR column indices
4443: Level: developer
4445: Note:
4446: When compressed is true, the CSR structure does not contain empty rows
4448: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4449: @*/
4450: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4451: {
4452: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4453: CsrMatrix *csr;
4454: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4456: PetscFunctionBegin;
4458: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4459: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4460: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4461: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4462: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4463: csr = (CsrMatrix *)cusp->mat->mat;
4464: if (i) {
4465: if (!compressed && a->compressedrow.use) { /* need full row offset */
4466: if (!cusp->rowoffsets_gpu) {
4467: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4468: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4469: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4470: }
4471: *i = cusp->rowoffsets_gpu->data().get();
4472: } else *i = csr->row_offsets->data().get();
4473: }
4474: if (j) *j = csr->column_indices->data().get();
4475: PetscFunctionReturn(PETSC_SUCCESS);
4476: }
4478: /*@C
4479: MatSeqAIJCUSPARSERestoreIJ - restore the device CSR row pointers `i` and column indices `j` obtained with `MatSeqAIJCUSPARSEGetIJ()`
4481: Not Collective
4483: Input Parameters:
4484: + A - the matrix
4485: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4486: . i - the CSR row pointers
4487: - j - the CSR column indices
4489: Level: developer
4491: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4492: @*/
4493: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4494: {
4495: PetscFunctionBegin;
4497: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4498: if (i) *i = NULL;
4499: if (j) *j = NULL;
4500: (void)compressed;
4501: PetscFunctionReturn(PETSC_SUCCESS);
4502: }
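/* A minimal usage sketch (editorial): obtaining the device CSR row pointers and column indices
   of a MATSEQAIJCUSPARSE matrix A. The returned pointers are device pointers (not dereferenceable
   on the host) and remain owned by the matrix.

     const int *di, *dj;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj));
     // di/dj may now be passed to a user CUDA kernel or a cuSPARSE routine
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/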
4504: /*@C
4505: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4507: Not Collective
4509: Input Parameter:
4510: . A - a `MATSEQAIJCUSPARSE` matrix
4512: Output Parameter:
4513: . a - pointer to the device data
4515: Level: developer
4517: Note:
4518: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4520: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4521: @*/
4522: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4523: {
4524: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4525: CsrMatrix *csr;
4527: PetscFunctionBegin;
4529: PetscAssertPointer(a, 2);
4530: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4531: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4532: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4533: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4534: csr = (CsrMatrix *)cusp->mat->mat;
4535: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4536: *a = csr->values->data().get();
4537: PetscFunctionReturn(PETSC_SUCCESS);
4538: }
4540: /*@C
4541: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4543: Not Collective
4545: Input Parameters:
4546: + A - a `MATSEQAIJCUSPARSE` matrix
4547: - a - pointer to the device data
4549: Level: developer
4551: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4552: @*/
4553: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4554: {
4555: PetscFunctionBegin;
4557: PetscAssertPointer(a, 2);
4558: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4559: *a = NULL;
4560: PetscFunctionReturn(PETSC_SUCCESS);
4561: }
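/* A minimal usage sketch (editorial): reducing over the stored nonzeros of A on the device via the
   read-only value pointer. Assumes a real PetscScalar build and that <thrust/reduce.h> is available.

     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
     const PetscScalar *av;
     PetscScalar        sum;

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
     sum = thrust::reduce(thrust::device_pointer_cast(av), thrust::device_pointer_cast(av) + a->nz, (PetscScalar)0.0);
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
*/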
4563: /*@C
4564: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4566: Not Collective
4568: Input Parameter:
4569: . A - a `MATSEQAIJCUSPARSE` matrix
4571: Output Parameter:
4572: . a - pointer to the device data
4574: Level: developer
4576: Note:
4577: May trigger a host-to-device copy if the up-to-date matrix data is on the host
4579: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4580: @*/
4581: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4582: {
4583: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4584: CsrMatrix *csr;
4586: PetscFunctionBegin;
4588: PetscAssertPointer(a, 2);
4589: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4590: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4591: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4592: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4593: csr = (CsrMatrix *)cusp->mat->mat;
4594: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4595: *a = csr->values->data().get();
4596: A->offloadmask = PETSC_OFFLOAD_GPU;
4597: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4598: PetscFunctionReturn(PETSC_SUCCESS);
4599: }
4600: /*@C
4601: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4603: Not Collective
4605: Input Parameters:
4606: + A - a `MATSEQAIJCUSPARSE` matrix
4607: - a - pointer to the device data
4609: Level: developer
4611: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4612: @*/
4613: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4614: {
4615: PetscFunctionBegin;
4617: PetscAssertPointer(a, 2);
4618: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4619: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4620: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4621: *a = NULL;
4622: PetscFunctionReturn(PETSC_SUCCESS);
4623: }
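/* A minimal usage sketch (editorial): scaling every stored value of A in place on the device through
   the read-write pointer; restoring the array increases the object state so downstream users see the
   change. Assumes a real PetscScalar build and that <thrust/transform.h> and <thrust/functional.h>
   are available.

     Mat_SeqAIJ  *a  = (Mat_SeqAIJ *)A->data;
     PetscScalar *aa;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &aa));
     auto daa = thrust::device_pointer_cast(aa);
     thrust::transform(daa, daa + a->nz, thrust::make_constant_iterator((PetscScalar)2.0), daa, thrust::multiplies<PetscScalar>());
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &aa));
*/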
4625: /*@C
4626: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4628: Not Collective
4630: Input Parameter:
4631: . A - a `MATSEQAIJCUSPARSE` matrix
4633: Output Parameter:
4634: . a - pointer to the device data
4636: Level: developer
4638: Note:
4639: Does not trigger host-to-device copies and marks the device data as up to date
4641: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4642: @*/
4643: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4644: {
4645: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4646: CsrMatrix *csr;
4648: PetscFunctionBegin;
4650: PetscAssertPointer(a, 2);
4651: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4652: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4653: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4654: csr = (CsrMatrix *)cusp->mat->mat;
4655: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4656: *a = csr->values->data().get();
4657: A->offloadmask = PETSC_OFFLOAD_GPU;
4658: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4659: PetscFunctionReturn(PETSC_SUCCESS);
4660: }
4662: /*@C
4663: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4665: Not Collective
4667: Input Parameters:
4668: + A - a `MATSEQAIJCUSPARSE` matrix
4669: - a - pointer to the device data
4671: Level: developer
4673: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4674: @*/
4675: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4676: {
4677: PetscFunctionBegin;
4679: PetscAssertPointer(a, 2);
4680: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4681: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4682: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4683: *a = NULL;
4684: PetscFunctionReturn(PETSC_SUCCESS);
4685: }
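/* A minimal usage sketch (editorial): overwriting all stored values of A on the device without
   triggering a copy of the previous values from the host.

     Mat_SeqAIJ  *a  = (Mat_SeqAIJ *)A->data;
     PetscScalar *aa;

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &aa));
     PetscCallCUDA(cudaMemset(aa, 0, a->nz * sizeof(PetscScalar)));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &aa));
*/

/* The functors below support MatSeqAIJCUSPARSEMergeMats(): IJCompare4 orders zipped
   (row, column, value, flag) tuples by row and then column, and Shift offsets column indices
   by a fixed amount when appending the columns of B after those of A. */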
4687: struct IJCompare4 {
4688: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4689: {
4690: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4691: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4692: return false;
4693: }
4694: };
4696: struct Shift {
4697: int _shift;
4699: Shift(int shift) : _shift(shift) { }
4700: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4701: };
4703: /* Merges two SeqAIJCUSPARSE matrices A and B by concatenating corresponding rows, producing C = [A, B] (the [A';B']' operation in MATLAB notation) */
4704: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4705: {
4706: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4707: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4708: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4709: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4710: PetscInt Annz, Bnnz;
4711: cusparseStatus_t stat;
4712: PetscInt i, m, n, zero = 0;
4714: PetscFunctionBegin;
4717: PetscAssertPointer(C, 4);
4718: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4719: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4720: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4721: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4722: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4723: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4724: if (reuse == MAT_INITIAL_MATRIX) {
4725: m = A->rmap->n;
4726: n = A->cmap->n + B->cmap->n;
4727: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4728: PetscCall(MatSetSizes(*C, m, n, m, n));
4729: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4730: c = (Mat_SeqAIJ *)(*C)->data;
4731: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4732: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4733: Ccsr = new CsrMatrix;
4734: Cmat->cprowIndices = NULL;
4735: c->compressedrow.use = PETSC_FALSE;
4736: c->compressedrow.nrows = 0;
4737: c->compressedrow.i = NULL;
4738: c->compressedrow.rindex = NULL;
4739: Ccusp->workVector = NULL;
4740: Ccusp->nrows = m;
4741: Ccusp->mat = Cmat;
4742: Ccusp->mat->mat = Ccsr;
4743: Ccsr->num_rows = m;
4744: Ccsr->num_cols = n;
4745: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4746: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4747: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4748: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4749: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4750: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4751: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4752: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4753: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4754: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4755: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4756: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4757: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4759: Acsr = (CsrMatrix *)Acusp->mat->mat;
4760: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4761: Annz = (PetscInt)Acsr->column_indices->size();
4762: Bnnz = (PetscInt)Bcsr->column_indices->size();
4763: c->nz = Annz + Bnnz;
4764: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4765: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4766: Ccsr->values = new THRUSTARRAY(c->nz);
4767: Ccsr->num_entries = c->nz;
4768: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4769: if (c->nz) {
4770: auto Acoo = new THRUSTINTARRAY32(Annz);
4771: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4772: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4773: THRUSTINTARRAY32 *Aroff, *Broff;
4775: if (a->compressedrow.use) { /* need full row offset */
4776: if (!Acusp->rowoffsets_gpu) {
4777: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4778: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4779: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4780: }
4781: Aroff = Acusp->rowoffsets_gpu;
4782: } else Aroff = Acsr->row_offsets;
4783: if (b->compressedrow.use) { /* need full row offset */
4784: if (!Bcusp->rowoffsets_gpu) {
4785: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4786: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4787: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4788: }
4789: Broff = Bcusp->rowoffsets_gpu;
4790: } else Broff = Bcsr->row_offsets;
4791: PetscCall(PetscLogGpuTimeBegin());
4792: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4793: PetscCallCUSPARSE(stat);
4794: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4795: PetscCallCUSPARSE(stat);
4796: /* Using bool here caused issues with large matrices on SUMMIT (CUDA 10.2.89), so integer flags are used instead */
4797: auto Aperm = thrust::make_constant_iterator(1);
4798: auto Bperm = thrust::make_constant_iterator(0);
4799: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4800: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4801: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4802: #else
4803: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4804: auto Bcib = Bcsr->column_indices->begin();
4805: auto Bcie = Bcsr->column_indices->end();
4806: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4807: #endif
4808: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4809: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4810: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4811: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4812: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4813: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4814: auto p1 = Ccusp->coords->begin();
4815: auto p2 = Ccusp->coords->begin();
4816: thrust::advance(p2, Annz);
4817: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4818: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4819: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4820: #endif
4821: auto cci = thrust::make_counting_iterator(zero);
4822: auto cce = thrust::make_counting_iterator(c->nz);
4823: #if 0 //Errors on SUMMIT cuda 11.1.0
4824: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4825: #else
4826: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4827: auto pred = thrust::identity<int>();
4828: #else
4829: auto pred = cuda::std::identity();
4830: #endif
4831: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4832: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4833: #endif
4834: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4835: PetscCallCUSPARSE(stat);
4836: PetscCall(PetscLogGpuTimeEnd());
4837: delete wPerm;
4838: delete Acoo;
4839: delete Bcoo;
4840: delete Ccoo;
4841: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4842: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4843: PetscCallCUSPARSE(stat);
4844: #endif
4845: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B form explicit transposes, generate the transpose of C too */
4846: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4847: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4848: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4849: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4850: CsrMatrix *CcsrT = new CsrMatrix;
4851: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4852: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4854: (*C)->form_explicit_transpose = PETSC_TRUE;
4855: (*C)->transupdated = PETSC_TRUE;
4856: Ccusp->rowoffsets_gpu = NULL;
4857: CmatT->cprowIndices = NULL;
4858: CmatT->mat = CcsrT;
4859: CcsrT->num_rows = n;
4860: CcsrT->num_cols = m;
4861: CcsrT->num_entries = c->nz;
4863: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4864: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4865: CcsrT->values = new THRUSTARRAY(c->nz);
4867: PetscCall(PetscLogGpuTimeBegin());
4868: auto rT = CcsrT->row_offsets->begin();
4869: if (AT) {
4870: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4871: thrust::advance(rT, -1);
4872: }
4873: if (BT) {
4874: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4875: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4876: thrust::copy(titb, tite, rT);
4877: }
4878: auto cT = CcsrT->column_indices->begin();
4879: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4880: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4881: auto vT = CcsrT->values->begin();
4882: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4883: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4884: PetscCall(PetscLogGpuTimeEnd());
4886: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4887: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4888: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4889: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4890: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4891: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4892: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4893: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4894: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4895: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4896: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4897: PetscCallCUSPARSE(stat);
4898: #endif
4899: Ccusp->matTranspose = CmatT;
4900: }
4901: }
4903: c->free_a = PETSC_TRUE;
4904: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4905: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4906: c->free_ij = PETSC_TRUE;
4907: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4908: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4909: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4910: ii = *Ccsr->row_offsets;
4911: jj = *Ccsr->column_indices;
4912: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4913: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4914: } else {
4915: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4916: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4917: }
4918: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4919: PetscCall(PetscMalloc1(m, &c->ilen));
4920: PetscCall(PetscMalloc1(m, &c->imax));
4921: c->maxnz = c->nz;
4922: c->nonzerorowcnt = 0;
4923: c->rmax = 0;
4924: for (i = 0; i < m; i++) {
4925: const PetscInt nn = c->i[i + 1] - c->i[i];
4926: c->ilen[i] = c->imax[i] = nn;
4927: c->nonzerorowcnt += (PetscInt)!!nn;
4928: c->rmax = PetscMax(c->rmax, nn);
4929: }
4930: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4931: PetscCall(PetscMalloc1(c->nz, &c->a));
4932: (*C)->nonzerostate++;
4933: PetscCall(PetscLayoutSetUp((*C)->rmap));
4934: PetscCall(PetscLayoutSetUp((*C)->cmap));
4935: Ccusp->nonzerostate = (*C)->nonzerostate;
4936: (*C)->preallocated = PETSC_TRUE;
4937: } else {
4938: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4939: c = (Mat_SeqAIJ *)(*C)->data;
4940: if (c->nz) {
4941: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4942: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4943: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4944: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4945: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4946: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4947: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4948: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4949: Acsr = (CsrMatrix *)Acusp->mat->mat;
4950: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4951: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4952: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4953: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4954: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4955: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4956: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4957: auto pmid = Ccusp->coords->begin();
4958: thrust::advance(pmid, Acsr->num_entries);
4959: PetscCall(PetscLogGpuTimeBegin());
4960: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4961: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4962: thrust::for_each(zibait, zieait, VecCUDAEquals());
4963: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4964: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4965: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4966: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4967: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4968: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4969: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4970: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4971: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4972: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4973: auto vT = CcsrT->values->begin();
4974: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4975: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4976: (*C)->transupdated = PETSC_TRUE;
4977: }
4978: PetscCall(PetscLogGpuTimeEnd());
4979: }
4980: }
4981: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4982: (*C)->assembled = PETSC_TRUE;
4983: (*C)->was_assembled = PETSC_FALSE;
4984: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4985: PetscFunctionReturn(PETSC_SUCCESS);
4986: }
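/* A minimal usage sketch (editorial): concatenating the columns of two sequential CUSPARSE matrices
   A (m x na) and B (m x nb) into C = [A, B] of size m x (na + nb); with MAT_REUSE_MATRIX only the
   numerical values of an existing C are refreshed.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... update the values of A and/or B ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
     PetscCall(MatDestroy(&C));
*/

/* The routine below gathers the device CSR values of A at positions idx[] into v[], which may be
   host or device memory; when idx is NULL the first n stored values are copied directly. */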
4988: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4989: {
4990: bool dmem;
4991: const PetscScalar *av;
4993: PetscFunctionBegin;
4994: dmem = isCudaMem(v);
4995: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4996: if (n && idx) {
4997: THRUSTINTARRAY widx(n);
4998: widx.assign(idx, idx + n);
4999: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5001: THRUSTARRAY *w = NULL;
5002: thrust::device_ptr<PetscScalar> dv;
5003: if (dmem) {
5004: dv = thrust::device_pointer_cast(v);
5005: } else {
5006: w = new THRUSTARRAY(n);
5007: dv = w->data();
5008: }
5009: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5011: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5012: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5013: thrust::for_each(zibit, zieit, VecCUDAEquals());
5014: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5015: delete w;
5016: } else {
5017: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5018: }
5019: if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* the values were copied from device to host */
5020: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5021: PetscFunctionReturn(PETSC_SUCCESS);
5022: }
5023: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()