/* aijcusparse.cu */
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #endif
19: #include <thrust/iterator/constant_iterator.h>
20: #include <thrust/remove.h>
21: #include <thrust/sort.h>
22: #include <thrust/unique.h>
23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24: #include <cuda/std/functional>
25: #endif
/* String names of MatCUSPARSEStorageFormat values, for PetscOptionsEnum(). The trailing three
   entries (enum type name, option prefix, null terminator) follow the PETSc enum-string convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
  0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
  Note: the string positions must therefore stay in sync with the numeric values of the cuSPARSE enums;
  MatSetFromOptions_SeqAIJCUSPARSE() contains PetscCheck()s that guard against cuSPARSE renumbering them.
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
/* Forward declarations of the type-specific implementations defined later in this file */

/* Symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Triangular-solve routines only used on the legacy (pre-SpSV, CUDA < 11.4) code path */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
#endif
/* Options processing and basic algebraic operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat, Vec, Vec);
/* Matrix-vector products (plain, transpose, Hermitian transpose, and their Add variants) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);

/* Device/host synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Sub-array copy and COO assembly support */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
/* Type-specific implementation behind MatCUSPARSESetFormat(): store the requested
   GPU storage format in the matrix's cusparse context. For a sequential matrix
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL have the same effect, since there is only
   one stored matrix; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops set the single stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
91: /*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats.
95: Not Collective
97: Input Parameters:
98: + A - Matrix of type `MATSEQAIJCUSPARSE`
99: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
100: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
101: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
103: Level: intermediate
105: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
106: @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  /* Validate the public-API argument before dispatching; without this an invalid/NULL A
     would fail inside PetscTryMethod() with no useful diagnostic */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod() is a no-op for matrix types that do not implement this method */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): record in the
   cusparse context whether MatSolve() should run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
124: /*@
125: MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
127: Input Parameters:
128: + A - Matrix of type `MATSEQAIJCUSPARSE`
129: - use_cpu - set flag for using the built-in CPU `MatSolve()`
131: Level: intermediate
133: Note:
134: The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
135: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
138: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
139: @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* Validate the public-API argument before dispatching; without this an invalid/NULL A
     would fail inside PetscTryMethod() with no useful diagnostic */
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* PetscTryMethod() is a no-op for matrix types that do not implement this method */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* MatSetOption() implementation for MATSEQAIJCUSPARSE. Only MAT_FORM_EXPLICIT_TRANSPOSE
   needs GPU-specific handling; everything else is delegated to the host AIJ implementation. */
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Process runtime options for MATSEQAIJCUSPARSE: GPU storage format, CPU-vs-GPU triangular
   solve, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm selections. Options are
   only parsed for unfactored matrices (factortype == MAT_FACTOR_NONE). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* -mat_cusparse_mult_storage_format only changes the SpMV format; -mat_cusparse_storage_format changes everything */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
200: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Mirror the CPU-computed LU factors of A on the GPU as a single combined CSR matrix M
   holding L (unit diagonal stored explicitly) and U, then set up the cuSPARSE SpSV
   descriptors for the two triangular solves. The first call allocates all host and device
   structures; subsequent calls only refresh the numerical values and redo (or update) the
   SpSV analysis. No-op unless A's latest factors live on the CPU (offloadmask == PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];       // strictly-lower entries in row i
        PetscInt ulen = adiag[i] - adiag[i + 1]; // upper entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
        Mj[Mi[i] + llen] = i;                                   // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device; only the sparsity pattern is copied here, values follow below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // Both descriptors share the same device arrays; the fill mode/diag type attributes select which triangle each uses
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep the host row pointers and value buffer so later numeric refactorizations avoid re-allocating
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = adiag[i] - adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]];       // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; // transpose solves must redo their own analysis
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
306: #else
/* CUDA < 11.4 path: extract the lower-triangular factor L from the host (I)LU factors,
   inserting the implicit unit diagonal explicitly, build it as a CsrMatrix on the GPU, and
   run the csrsv solve analysis. The first call allocates loTriFactor and the GPU arrays;
   later calls (loTriFactor already set) only refresh the values. No-op for n == 0 or when
   the factors are already on the GPU. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for fast transfers to the device */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 of L is just the unit diagonal */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v       = aa;
        vi      = aj;
        offset  = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
          PetscCall(PetscArraycpy(&AALo[offset], v, nz));

          /* append the unit diagonal entry after the strictly-lower part of the row */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h is kept for value-only updates, the index buffers are no longer needed on the host */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* CUDA < 11.4 path: extract the upper-triangular factor U from the host (I)LU factors
   (inverting the stored reciprocal diagonal back into an explicit diagonal entry), build it
   as a CsrMatrix on the GPU, and run the csrsv solve analysis. The first call allocates
   upTriFactor and the GPU arrays; later calls only refresh the values. Rows are assembled
   from the bottom up because the host layout stores U rows via the adiag[] markers. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers for fast transfers to the device */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* host stores the reciprocal of the diagonal; invert it back */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h is kept for value-only updates, the index buffers are no longer needed on the host */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
569: #endif
/* Push the CPU-computed (I)LU factors of A to the GPU — via the SpSV path on CUDA >= 11.4
   or the legacy lower/upper csrsv path otherwise — and cache on the device the row/column
   permutation indices used by MatSolve when the factorization orderings are not identities.
   Marks the factors as synchronized (PETSC_OFFLOAD_BOTH) on success. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  /* scratch vector used between the lower and upper csrsv solves */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices: cache the row permutation on the device (only once) */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: cache the inverse column permutation on the device (only once) */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
618: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Build (or refresh) the device-side representation of the host ICC/Cholesky
  factor of A, stored as a unit-upper-triangular CSR matrix U plus a separate
  (inverted) diagonal D, so that solves can use cusparse SpSV with U and Ut.

  First call: allocates device CSR arrays, host staging buffers (csrVal_h,
  diag_h), cusparse matrix/vector descriptors, and SpSV work buffers.
  Subsequent calls (when A's latest factors are on the CPU): only re-pack and
  re-upload values, then either update the existing SpSV analysis (CUDA >=
  12.1.1) or redo cusparseSpSV_analysis().
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *adiag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[adiag[i]];     // actually Aa[adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0;   // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // off-diagonal entries of U are negated copies of the host factor's values
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // Cheap path: tell cusparse only the values changed (sparsity pattern is unchanged)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Solve Ut D U x = b, where U is the unit-upper-triangular factor built by
// MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky() and fs->diag holds the
// (already inverted) diagonal D.  Steps:
//   1) X = P_r b            (row permutation, only if fs->rpermIndices is set)
//   2) Ut Y = X             (cusparse SpSV, transpose op)
//   3) Y = Y .* diag        (element-wise; diag was inverted on the host)
//   4) U  X = Y             (cusparse SpSV, non-transpose op)
//   5) x = P_c X            (column permutation, only if fs->cpermIndices is set)
// When no permutation is needed the b/x arrays are used directly as X.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors          *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                            *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar>  bGPU;
  thrust::device_ptr<PetscScalar>        xGPU;
  const cusparseSpSVAlg_t                alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                               m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: alias the input array directly (b is not modified by the solve)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
#if CCCL_VERSION >= 3001000
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
#else
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
#endif

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: write the solve result straight into x
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // two triangular solves (~2*nz each) plus the diagonal scaling
  PetscFunctionReturn(PETSC_SUCCESS);
}
773: #else
/*
  Legacy (CUDA < 11.4) path: build the cusparse lower/upper triangular factors
  on the GPU from the host ICC factor of A.

  The host factor is stored upper-triangular with an inverted diagonal kept as
  the last entry of each row (v[nz] below).  Both GPU factors share the same
  upper-triangular sparsity: the "upper" factor solves with NON_TRANSPOSE and a
  unit diagonal, while the "lower" factor reuses the same pattern with
  TRANSPOSE and a non-unit diagonal.  On the first call the CSR structures,
  matrix descriptors, and csrsv analysis data are created; on later calls only
  the numerical values are re-packed and re-uploaded.

  NOTE(review): the data is accessed through both Mat_SeqAIJ and Mat_SeqSBAIJ
  views of A->data; this relies on the leading layout of the two structs
  matching — confirm against the struct definitions if modifying.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of both factors */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz]; /* v[nz] is the stored (inverted) diagonal */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* off-diagonals: negate for U; additionally scale by the diagonal for L */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* same upper-triangular storage as U; the lower solve is expressed as a TRANSPOSE op */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structures already exist: only re-pack and re-upload the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
970: #endif
/*
  Push the host-computed ICC (Cholesky) factors of A to the GPU and cache the
  row/column permutations needed by the triangular solves.

  On CUDA >= 11.4 the combined U + diagonal representation is built
  (MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky); on older CUDA the two
  triangular cusparse factors are built and a work vector is allocated.
  When the row permutation is not the identity, both it and its inverse are
  uploaded (rpermIndices / cpermIndices) in one shot.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* both factors share the strictly-triangular entries; the diagonal is counted once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);   // permutation, host-to-device
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n); // inverse permutation, host-to-device
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
  runs on the CPU (after syncing A's values back from the GPU), then the
  resulting factors are analyzed and copied to the GPU, and the solve
  function pointers are installed.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));          /* ensure A's values are current on the host */
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); /* CPU factorization */
  B->offloadmask = PETSC_OFFLOAD_CPU;                  /* the fresh factors live on the host for now */

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* the SpSV-based solver handles permutations itself, so one routine covers both cases;
     solve == solvetranspose since the Cholesky factorization is symmetric */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used. */
  Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1047: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  Legacy (CUDA < 11.4) path: build the transposes (CSC form) of the cached
  lower and upper triangular factors so that transposed triangular solves can
  be performed with NON_TRANSPOSE operations, and run the csrsv analysis on
  each transpose.

  For each factor the transpose inherits the original's matrix type, index
  base, and diag type, with the fill mode flipped (upper <-> lower).  The
  results are stored in loTriFactorPtrTranspose / upTriFactorPtrTranspose.

  Fix: the MAT_CUSPARSEGenerateTranspose event was previously begun twice
  (PetscLogEventBegin called where PetscLogEventEnd was intended after each
  csr2csc conversion), leaving the event permanently open; both sites now
  correctly call PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event would never be closed */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was PetscLogEventBegin: event would never be closed */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1211: #endif
1213: struct PetscScalarToPetscInt {
  /* Unary functor used with thrust::transform: takes the real part of a PetscScalar and
     truncates it to a PetscInt. Needed because csr2csc permutation indices are first
     computed in a scalar-valued array (see MatSeqAIJCUSPARSEFormExplicitTranspose). */
1214:   __host__ __device__ PetscInt operator()(PetscScalar s) { return static_cast<PetscInt>(PetscRealPart(s)); }
1215: };
/* Build (or numerically refresh) an explicit transpose of A on the GPU, cached in
   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose. On the first call the transpose
   structure and a csr2csc index permutation (csr2csc_i) are created; on later calls,
   when A->transupdated is false, only the values are refreshed by permuting A's value
   array with the cached csr2csc_i. No-op if A->transupdated is already true. */
1217: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1218: {
1219:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1220:   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1221:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1222:   cusparseStatus_t stat;
1223:   cusparseIndexBase_t indexBase;
1225:   PetscFunctionBegin;
1226:   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1227:   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1228:   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1229:   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1230:   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1231:   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1232:   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1233:   PetscCall(PetscLogGpuTimeBegin());
      /* Non-CSR transposes cannot be refreshed in place; discard and rebuild */
1234:   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      /* First call: allocate the transpose mult struct, its scalar constants, and CSR storage */
1235:   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1236:     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1237:     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1238:     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1239:     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1240:     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1242:     /* set alpha and beta */
1243:     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1244:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1245:     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1246:     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1247:     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1248:     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1250:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1251:       CsrMatrix *matrixT = new CsrMatrix;
1252:       matstructT->mat = matrixT;
          /* transpose has swapped row/column dimensions, same number of nonzeros */
1253:       matrixT->num_rows = A->cmap->n;
1254:       matrixT->num_cols = A->rmap->n;
1255:       matrixT->num_entries = a->nz;
1256:       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1257:       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1258:       matrixT->values = new THRUSTARRAY(a->nz);
          /* upload A's (uncompressed) row offsets; needed as csr2csc input below */
1260:       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1261:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1263: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1264: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1265:       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1266:                                indexBase, cusparse_scalartype);
1267:       PetscCallCUSPARSE(stat);
1268: #else
1269:       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1270:          see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1272:          I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1273:          it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1274:          when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1275:       */
1276:       if (matrixT->num_entries) {
1277:         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1278:         PetscCallCUSPARSE(stat);
1280:       } else {
1281:         matstructT->matDescr = NULL;
1282:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1283:       }
1284: #endif
1285: #endif
1286:     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1287: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1288:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1289: #else
        /* pre-CUDA-11 HYB/ELL path: HYB -> CSR -> CSC -> HYB via temporary CSR matrices */
1290:       CsrMatrix *temp = new CsrMatrix;
1291:       CsrMatrix *tempT = new CsrMatrix;
1292:       /* First convert HYB to CSR */
1293:       temp->num_rows = A->rmap->n;
1294:       temp->num_cols = A->cmap->n;
1295:       temp->num_entries = a->nz;
1296:       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1297:       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1298:       temp->values = new THRUSTARRAY(a->nz);
1300:       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1301:       PetscCallCUSPARSE(stat);
1303:       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1304:       tempT->num_rows = A->rmap->n;
1305:       tempT->num_cols = A->cmap->n;
1306:       tempT->num_entries = a->nz;
1307:       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1308:       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1309:       tempT->values = new THRUSTARRAY(a->nz);
1311:       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1312:                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1313:       PetscCallCUSPARSE(stat);
1315:       /* Last, convert CSC to HYB */
1316:       cusparseHybMat_t hybMat;
1317:       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1318:       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1319:       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1320:       PetscCallCUSPARSE(stat);
1322:       /* assign the pointer */
1323:       matstructT->mat = hybMat;
1324:       A->transupdated = PETSC_TRUE;
1325:       /* delete temporaries */
1326:       if (tempT) {
1327:         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1328:         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1329:         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1330:         delete (CsrMatrix *)tempT;
1331:       }
1332:       if (temp) {
1333:         if (temp->values) delete (THRUSTARRAY *)temp->values;
1334:         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1335:         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1336:         delete (CsrMatrix *)temp;
1337:       }
1338: #endif
1339:     }
1340:   }
1341:   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1342:     CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1343:     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1344:     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1345:     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1346:     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1347:     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1348:     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1349:     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1350:     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1351:     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1352:     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1353:       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1354:       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1355:       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1356:     }
        /* First time through: run csr2csc on the sequence 0..nnz-1 (stored as scalars) so that
           the "transposed values" it produces are the permutation mapping CSR value positions
           to CSC positions; cache that as csr2csc_i so later updates are a cheap gather */
1357:     if (!cusparsestruct->csr2csc_i) {
1358:       THRUSTARRAY csr2csc_a(matrix->num_entries);
1359:       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1361:       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1362: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1363:       void *csr2cscBuffer;
1364:       size_t csr2cscBufferSize;
1365:       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1366:                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1367:       PetscCallCUSPARSE(stat);
1368:       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1369: #endif
1371:       if (matrix->num_entries) {
1372:         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1373:            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1374:            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1376:            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1377:            should be filled with indexBase. So I just take a shortcut here.
1378:         */
1379:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1380: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1381:                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1382:         PetscCallCUSPARSE(stat);
1383: #else
1384:                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1385:         PetscCallCUSPARSE(stat);
1386: #endif
1387:       } else {
1388:         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1389:       }
        /* convert the scalar-encoded permutation to integer indices (see PetscScalarToPetscInt) */
1391:       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1392:       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1393: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1394:       PetscCallCUDA(cudaFree(csr2cscBuffer));
1395: #endif
1396:     }
      /* Numeric refresh: gather A's values through the cached permutation into the transpose */
1397:     PetscCallThrust(
1398:       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1399:   }
1400:   PetscCall(PetscLogGpuTimeEnd());
1401:   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1402:   /* the compressed row indices is not used for matTranspose */
1403:   matstructT->cprowIndices = NULL;
1404:   /* assign the pointer */
1405:   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1406:   A->transupdated = PETSC_TRUE;
1407:   PetscFunctionReturn(PETSC_SUCCESS);
1408: }
1410: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b using the cuSPARSE SpSV LU factors stored in A->spptr (CUDA >= 11.4 path).
   Applies the row permutation (rpermIndices) to b if present, solves L then U with
   cusparseSpSV_solve, then applies the column permutation (cpermIndices) to the result.
   fs->X and fs->Y are device work vectors wrapped by dnVecDescr_X/_Y. */
1411: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1412: {
1413:   const PetscScalar *barray;
1414:   PetscScalar *xarray;
1415:   thrust::device_ptr<const PetscScalar> bGPU;
1416:   thrust::device_ptr<PetscScalar> xGPU;
1417:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1418:   const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1419:   const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1420:   const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1421:   PetscInt m = A->rmap->n;
1423:   PetscFunctionBegin;
1424:   PetscCall(PetscLogGpuTimeBegin());
1425:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1426:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1427:   xGPU = thrust::device_pointer_cast(xarray);
1428:   bGPU = thrust::device_pointer_cast(barray);
1430:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1431:   if (fs->rpermIndices) {
1432:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1433:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1434:   } else {
        // no permutation: point the X descriptor straight at b's device array (read only)
1435:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1436:   }
1438:   // Solve L Y = X
1439:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1440:   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1441:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1443:   // Solve U X = Y
1444:   if (fs->cpermIndices) {
        // result must be column-permuted afterwards, so solve into the work vector fs->X
1445:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1446:   } else {
        // no column permutation: solve directly into x's device array
1447:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1448:   }
1449:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1451:   // Reorder X with the column permutation if needed, and put the result back to x
1452:   if (fs->cpermIndices) {
1453:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1454:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1455:   }
1456:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1457:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1458:   PetscCall(PetscLogGpuTimeEnd());
      // two triangular solves: ~2*nz - m flops
1459:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1460:   PetscFunctionReturn(PETSC_SUCCESS);
1461: }
/* Solve A^T x = b with the SpSV LU factors (CUDA >= 11.4 path): U^T y = b then L^T x = y.
   The same spMatDescr_L/_U matrices are reused with CUSPARSE_OPERATION_TRANSPOSE; transpose
   SpSV descriptors, buffers and analysis are created lazily on first use and cached via the
   createdTransposeSpSVDescr / updatedTransposeSpSVAnalysis flags. Row/column permutations
   are applied exactly as in MatSolve_SeqAIJCUSPARSE_LU. */
1463: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1464: {
1465:   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1466:   Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1467:   const PetscScalar *barray;
1468:   PetscScalar *xarray;
1469:   thrust::device_ptr<const PetscScalar> bGPU;
1470:   thrust::device_ptr<PetscScalar> xGPU;
1471:   const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1472:   const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1473:   PetscInt m = A->rmap->n;
1475:   PetscFunctionBegin;
1476:   PetscCall(PetscLogGpuTimeBegin());
1477:   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1478:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1479:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1480:                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1482:     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1483:     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1484:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1485:     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1486:     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1487:   }
      /* (re)run the numeric analysis for the transpose solves; reset elsewhere when values change */
1489:   if (!fs->updatedTransposeSpSVAnalysis) {
1490:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1492:     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1493:     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1494:   }
1496:   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1497:   PetscCall(VecCUDAGetArrayRead(b, &barray));
1498:   xGPU = thrust::device_pointer_cast(xarray);
1499:   bGPU = thrust::device_pointer_cast(barray);
1501:   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1502:   if (fs->rpermIndices) {
1503:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1504:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1505:   } else {
1506:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1507:   }
1509:   // Solve Ut Y = X
1510:   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1511:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1513:   // Solve Lt X = Y
1514:   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1515:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1516:   } else {
1517:     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1518:   }
1519:   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1521:   // Reorder X with the column permutation if needed, and put the result back to x
1522:   if (fs->cpermIndices) {
1523:     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1524:                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1525:   }
1527:   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1528:   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1529:   PetscCall(PetscLogGpuTimeEnd());
      // two triangular solves: ~2*nz - n flops
1530:   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1531:   PetscFunctionReturn(PETSC_SUCCESS);
1532: }
1533: #else
1534: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Legacy (CUDA < 11.4) transpose solve with row/column permutations: builds transposed
   triangular factors lazily, then solves U^T then L^T with cusparseXcsrsv_solve, using the
   shared workVector as scratch. xx is used as an in-place intermediate between the permute
   and solve steps. */
1535: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1536: {
1537:   PetscInt n = xx->map->n;
1538:   const PetscScalar *barray;
1539:   PetscScalar *xarray;
1540:   thrust::device_ptr<const PetscScalar> bGPU;
1541:   thrust::device_ptr<PetscScalar> xGPU;
1542:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1543:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1544:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1545:   THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1547:   PetscFunctionBegin;
1548:   /* Analyze the matrix and create the transpose ... on the fly */
1549:   if (!loTriFactorT && !upTriFactorT) {
1550:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1551:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1552:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1553:   }
1555:   /* Get the GPU pointers */
1556:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1557:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1558:   xGPU = thrust::device_pointer_cast(xarray);
1559:   bGPU = thrust::device_pointer_cast(barray);
1561:   PetscCall(PetscLogGpuTimeBegin());
1562:   /* First, reorder with the row permutation */
1563:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1565:   /* First, solve U */
      /* U^T solve: xarray -> tempGPU */
1566:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1567:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1569:   /* Then, solve L */
      /* L^T solve: tempGPU -> xarray */
1570:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1571:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1573:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1574:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1576:   /* Copy the temporary to the full solution. */
1577:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1579:   /* restore */
1580:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1581:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1582:   PetscCall(PetscLogGpuTimeEnd());
1583:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1584:   PetscFunctionReturn(PETSC_SUCCESS);
1585: }
/* Legacy (CUDA < 11.4) transpose solve for natural ordering (no row/column permutations):
   lazily build the transposed factors, then solve U^T (bb -> work) and L^T (work -> xx). */
1587: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1588: {
1589:   const PetscScalar *barray;
1590:   PetscScalar *xarray;
1591:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1592:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1593:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1594:   THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1596:   PetscFunctionBegin;
1597:   /* Analyze the matrix and create the transpose ... on the fly */
1598:   if (!loTriFactorT && !upTriFactorT) {
1599:     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1600:     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1601:     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1602:   }
1604:   /* Get the GPU pointers */
1605:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1606:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1608:   PetscCall(PetscLogGpuTimeBegin());
1609:   /* First, solve U */
      /* U^T solve: barray -> tempGPU */
1610:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1611:                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1613:   /* Then, solve L */
      /* L^T solve: tempGPU -> xarray */
1614:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1615:                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1617:   /* restore */
1618:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1619:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1620:   PetscCall(PetscLogGpuTimeEnd());
1621:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1622:   PetscFunctionReturn(PETSC_SUCCESS);
1623: }
/* Legacy (CUDA < 11.4) forward solve with row/column permutations: permute bb into the
   work vector, solve L (work -> xx) then U (xx -> work), and finally gather the column
   permutation of the work vector back into xx. */
1625: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1626: {
1627:   const PetscScalar *barray;
1628:   PetscScalar *xarray;
1629:   thrust::device_ptr<const PetscScalar> bGPU;
1630:   thrust::device_ptr<PetscScalar> xGPU;
1631:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1632:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1633:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1634:   THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1636:   PetscFunctionBegin;
1637:   /* Get the GPU pointers */
1638:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1639:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1640:   xGPU = thrust::device_pointer_cast(xarray);
1641:   bGPU = thrust::device_pointer_cast(barray);
1643:   PetscCall(PetscLogGpuTimeBegin());
1644:   /* First, reorder with the row permutation */
1645:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1647:   /* Next, solve L */
      /* L solve: tempGPU -> xarray */
1648:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1649:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1651:   /* Then, solve U */
      /* U solve: xarray -> tempGPU */
1652:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1653:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1655:   /* Last, reorder with the column permutation */
1656:   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1658:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1659:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1660:   PetscCall(PetscLogGpuTimeEnd());
1661:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1662:   PetscFunctionReturn(PETSC_SUCCESS);
1663: }
/* Legacy (CUDA < 11.4) forward solve for natural ordering (no permutations): solve L
   (bb -> work) then U (work -> xx) with cusparseXcsrsv_solve. */
1665: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1666: {
1667:   const PetscScalar *barray;
1668:   PetscScalar *xarray;
1669:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1670:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1671:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1672:   THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1674:   PetscFunctionBegin;
1675:   /* Get the GPU pointers */
1676:   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1677:   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1679:   PetscCall(PetscLogGpuTimeBegin());
1680:   /* First, solve L */
      /* L solve: barray -> tempGPU */
1681:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1682:                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1684:   /* Next, solve U */
      /* U solve: tempGPU -> xarray */
1685:   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1686:                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1688:   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1689:   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1690:   PetscCall(PetscLogGpuTimeEnd());
1691:   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1692:   PetscFunctionReturn(PETSC_SUCCESS);
1693: }
1694: #endif
1696: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization of A into fact, done on the GPU with cusparseXcsrilu02().

   Assumes the symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(), which installs this
   routine as fact->ops->lufactornumeric) has already run: it allocated fs->csrRowPtr32/csrColIdx32/
   csrVal, created matDescr_M and the SpSV descriptors, and performed the csrilu02 analysis.
   Here we only refresh the values and redo the numeric work, so repeated factorizations with the
   same nonzero pattern are cheap. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact; the factorization below then overwrites fs->csrVal in place */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's values are current on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* blocking call; reports the row/column of the first numerically zero pivot, if any */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1 can refresh values in existing SpSV descriptors, skipping a full re-analysis */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count estimated by the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic ILU(0) setup of fact from A, kept on the GPU.

   ILU(0) has the same sparsity pattern as A, so "symbolic" here means: copy A's (32-bit) row
   pointers and column indices to fact, create the cuSPARSE descriptors for M (the combined
   factor), L and U, size and allocate the factorization/solve buffers, run the csrilu02
   analysis, and estimate the flop count of the numeric phase. The numeric phase is installed
   as fact->ops->lufactornumeric at the end.

   Fix vs. previous revision: the FLOPs estimate computed the exact number of sub-diagonal
   nonzeros per row (adiag[i] - Ai[i]) and then immediately overwrote it with the (nzRow-1)/2
   heuristic used by the ICC(0) path (which lacks diagonal markers); the overwrite was a dead
   store discarding the exact count. ILU(0) knows the diagonal position, so the exact count is
   used now. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): no fill beyond A's own pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an LU factorization is unit lower triangular */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt       *Ai, nzRow, nzLeft;
    const PetscInt *adiag;
    PetscLogDouble  flops = 0.0;

    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = adiag[i] - Ai[i]; /* exact number of entries strictly left of the diagonal */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Solves x = (L*L^T)^{-1} b for an ICC(0) factor held on the GPU.
   Both sweeps reuse spMatDescr_L: the forward sweep solves with L as-is, and the backward
   sweep solves with L transposed via spsvDescr_Lt (analyzed for CUSPARSE_OPERATION_TRANSPOSE
   in the numeric phase). fs->Y holds the intermediate vector y between the two sweeps. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *facj = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward sweep: L y = b, with y landing in fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward sweep: L^T x = y, writing directly into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * facj->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Numeric ICC(0) (incomplete Cholesky) factorization of A into fact on the GPU via
   cusparseXcsric02(). Assumes MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0() has run: it allocated
   fs->csrRowPtr32/csrColIdx32/csrVal, created matDescr_M and spMatDescr_L, and did the csric02
   analysis.

   Fix vs. previous revision: the device work is now bracketed with PetscLogGpuTimeBegin()/End(),
   matching MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0() and MatSolve_SeqAIJCUSPARSE_ICC0(); before,
   the flops were logged with no GPU-time bracket at all. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* blocking call; reports the location of the first numerically zero pivot, if any */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  /* CUDA >= 12.1.1 can refresh values in existing SpSV descriptors, skipping a full re-analysis */
  if (fs->updatedSpSVAnalysis) {
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    /* cusparseSpSV_analysis() is numeric (needs valid values), so it must follow cusparseXcsric02() */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
       ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* factor is symmetric: transpose solve is the same */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count estimated by the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic ICC(0) setup of fact from A, kept on the GPU.

   ICC(0) has the same sparsity pattern as A, so "symbolic" here means: copy A's (32-bit) row
   pointers and column indices to fact, create cuSPARSE descriptors for M (the factor storage)
   and L, size and allocate the factorization/solve buffers (sharing the factorization buffer
   with the larger of the two SpSV buffers), run the csric02 analysis, and estimate the flop
   count of the numeric phase. Installs MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 as the numeric
   routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool flg, diagDense;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
    PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0): no fill beyond A's own pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* Cholesky L carries a non-unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           (No diagonal markers here, so approximate the sub-diagonal count as half the row.)
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2149: #endif
/* LU numeric factorization: the factorization itself runs on the CPU via
   MatLUFactorNumeric_SeqAIJ(); only the subsequent triangular solves may run on the GPU
   (unless use_cpu_solve is set). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* the host factorization below reads A's host arrays */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factor values live on the host only */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    IS isrow = b->row, iscol = b->col;
    PetscBool row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      /* natural ordering: skip the permutation work in the solves */
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic LU factorization: discard any triangular-factor data left over from a previous
   factorization of B, defer the symbolic phase to the host SeqAIJ implementation, and point
   the numeric phase at the CUSPARSE-aware routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* ILU symbolic factorization dispatcher: take the all-GPU ILU(0) fast path when available
   (CUDA >= 11.4, zero fill levels, identity row/column orderings, and the user did not ask for
   a host factorization); otherwise fall back to the host SeqAIJ symbolic phase with the
   CUSPARSE-aware numeric routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) { /* only probe the orderings when a device factorization is allowed */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* ICC symbolic factorization dispatcher: take the all-GPU ICC(0) fast path when available
   (CUDA >= 11.4, zero fill levels, identity permutation, and the user did not ask for a host
   factorization); otherwise fall back to the host SeqAIJ symbolic phase with the
   CUSPARSE-aware numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); /* only probe when a device factorization is allowed */
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic Cholesky factorization: drop stale triangular-factor data from any earlier
   factorization of B, run the host SeqAIJ symbolic phase, and install the CUSPARSE-aware
   numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Report the solver package implemented by this factored matrix (always MATSOLVERCUSPARSE). */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2262: /*MC
2263: MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2264: on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2266: performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2267: CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2268: algorithms are not recommended. This class does NOT support direct solver operations.
2270: Level: beginner
2272: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2273: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2274: M*/
/*
  Creates a factor matrix B of type MATSEQAIJCUSPARSE for A, wiring the symbolic
  factorization function pointers according to the requested factor type and
  whether A is bound to the CPU.

  Input Parameters:
+ A     - the matrix to be factored (square; sizes taken from A->rmap->n)
- ftype - one of MAT_FACTOR_LU/ILU/ILUDT/CHOLESKY/ICC (others raise PETSC_ERR_SUP)

  Output Parameter:
. B - the new factor matrix (numeric factorization routines are installed later,
      by the symbolic phase)
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* propagate CPU binding from A when requested, so the factor also stays on the CPU */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* GPU variants only when A is not bound to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* the symbolic phase allocates the actual factor storage */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Copies the matrix values from the device back into the host CSR array a->a
  when the device copy is the newer one (offloadmask == PETSC_OFFLOAD_GPU).
  Only values are copied, not the sparsity pattern. On success both copies are
  valid (offloadmask becomes PETSC_OFFLOAD_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* for factored matrices spptr holds a TriFactors struct instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: values live in the CSR mult struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Gives read/write access to the host CSR values, first syncing them from the
   GPU if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* ensure host values are up to date before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Ends read/write access obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE: the
   host copy may have been modified, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;               /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;  /* host is now the authoritative copy */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Gives read-only access to the host CSR values, first syncing them from the
   GPU if the device copy is newer. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* ensure host values are up to date before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Ends read-only access: nothing was modified, so the offload mask is left
   untouched; only the caller's pointer is invalidated. The Mat argument is unused. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Gives write-only access to the host CSR values. Since existing values will be
   overwritten, no device-to-host sync is performed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Ends write access obtained with MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE: the
   host copy was (re)written, so mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;               /* invalidate the caller's pointer */
  A->offloadmask = PETSC_OFFLOAD_CPU;  /* host is now the authoritative copy */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Returns pointers to the device-resident CSR arrays (row offsets, column
  indices, values) of an unfactored MATSEQAIJCUSPARSE matrix, copying host
  data to the GPU first if necessary. Any of i/j/a/mtype may be NULL if not
  wanted. The row/column index pointers are only available with 32-bit
  PetscInt, since the device arrays are THRUSTINTARRAY32.

  Fix: the two error messages read "does not supported"; corrected to
  "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA; /* all three arrays live in device memory */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Copies the matrix from the host Mat_SeqAIJ arrays to the device, building the
  cuSPARSE data structures. If the nonzero pattern is unchanged since the last
  copy (same nonzerostate, CSR format), only the values are re-uploaded;
  otherwise all device structures are rebuilt from scratch. Compressed-row
  storage on the host is honored (only rows with nonzeros are represented).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when there are no host values to upload */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose is stale (pattern still valid: destroy=FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* pattern changed (or non-CSR format): rebuild everything on the device */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row structure: compressed (only nonempty rows) or full */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* structure-only matrix: no values to upload */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);
          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR and convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Thrust functor: accumulates the first tuple element into the second
   (dest += src), usable on host and device. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
/* Thrust functor: copies the first tuple element into the second (dest = src),
   usable on host and device. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
/* Thrust functor: copies the second tuple element into the first (the reverse
   direction of VecCUDAEquals), usable on host and device. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
/* Per-product context stored on C->product->data for cuSPARSE matrix-matrix
   products (sparse*dense and sparse*sparse); freed by MatProductCtxDestroy_MatMatCusparse. */
struct MatProductCtx_MatMatCusparse {
  PetscBool    cisdense; /* C was originally MATSEQDENSE (CPU); convert back after computing on the GPU */
  PetscScalar *Bt;       /* device buffer for explicit B^T (pre-CUDA-11 path where csrmm cannot transpose B) */
  Mat          X;        /* intermediate dense matrix used by PtAP/RARt */
  PetscBool    reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr; /* alternative CSR view of B when B is stored in compressed row form */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor matching Bcsr above */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or X) */
  PetscInt             Blda, Clda;  /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMMreuse work buffers (CUDA >= 11.4) */
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* currently allocated size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM work buffer */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
/* Destructor installed on C->product->destroy: releases every device buffer and
   cuSPARSE descriptor held by a MatProductCtx_MatMatCusparse, the intermediate
   matrix X, and finally the context itself. Descriptor destroys are guarded
   against NULL handles; cudaFree on NULL is harmless. */
static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(PetscCtxRt data)
{
  MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(mmdata));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2646: #include <../src/mat/impls/dense/seq/dense.h>
/*
  Numeric phase of C = op(A)*op(B) where A is MATSEQAIJCUSPARSE and B is dense.
  Handles AB, AtB, ABt, PtAP and RARt. For PtAP/RARt the sparse-dense product is
  computed into the intermediate mmdata->X and then multiplied with B^T/B by a
  dense-dense product. Uses cusparseSpMM on CUDA >= 11 (descriptors cached in
  mmdata and rebuilt when leading dimensions change) and csrmm plus an explicit
  B transpose (cublasXgeam) on older toolkits.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatProductCtx_MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatProductCtx_MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which stored form of A to use, the operation to apply, and the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cusparse apply the transpose */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose instead */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt first compute into the intermediate X; everything else writes C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
#else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
#endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
#endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
    /* grow (never shrink) the cached work buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt via an out-of-place geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  /* finish PtAP/RARt with a dense-dense product of B (or B^T) and X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* undo the temporary GPU conversions done above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  Symbolic phase for C = op(A)*op(B) with A MATSEQAIJCUSPARSE and B dense:
  sets C's sizes and block sizes from the product type, makes C a
  MATSEQDENSECUDA (remembering whether it must be converted back to
  MATSEQDENSE), and allocates the product context (plus the intermediate
  matrix X needed by PtAP/RARt).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n;
  PetscBool                     cisdense, flg;
  MatProductCtx_MatMatCusparse *mmdata;
  Mat_SeqAIJCUSPARSE           *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n and block sizes depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2904: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2905: {
2906: Mat_Product *product = C->product;
2907: Mat A, B;
2908: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2909: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2910: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2911: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2912: PetscBool flg;
2913: cusparseStatus_t stat;
2914: MatProductType ptype;
2915: MatProductCtx_MatMatCusparse *mmdata;
2916: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2917: cusparseSpMatDescr_t BmatSpDescr;
2918: #endif
2919: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2921: PetscFunctionBegin;
2922: MatCheckProduct(C, 1);
2923: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2924: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2925: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2926: mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2927: A = product->A;
2928: B = product->B;
2929: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2930: mmdata->reusesym = PETSC_FALSE;
2931: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2932: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2933: Cmat = Ccusp->mat;
2934: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2935: Ccsr = (CsrMatrix *)Cmat->mat;
2936: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2937: goto finalize;
2938: }
2939: if (!c->nz) goto finalize;
2940: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2941: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2942: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2943: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2944: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2946: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2947: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2948: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2949: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2950: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2951: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2952: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2953: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2955: ptype = product->type;
2956: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2957: ptype = MATPRODUCT_AB;
2958: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2959: }
2960: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2961: ptype = MATPRODUCT_AB;
2962: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2963: }
2964: switch (ptype) {
2965: case MATPRODUCT_AB:
2966: Amat = Acusp->mat;
2967: Bmat = Bcusp->mat;
2968: break;
2969: case MATPRODUCT_AtB:
2970: Amat = Acusp->matTranspose;
2971: Bmat = Bcusp->mat;
2972: break;
2973: case MATPRODUCT_ABt:
2974: Amat = Acusp->mat;
2975: Bmat = Bcusp->matTranspose;
2976: break;
2977: default:
2978: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2979: }
2980: Cmat = Ccusp->mat;
2981: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2982: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2983: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2984: Acsr = (CsrMatrix *)Amat->mat;
2985: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2986: Ccsr = (CsrMatrix *)Cmat->mat;
2987: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2988: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2989: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2990: PetscCall(PetscLogGpuTimeBegin());
2991: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2992: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2993: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2994: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2995: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2996: PetscCallCUSPARSE(stat);
2997: #else
2998: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2999: PetscCallCUSPARSE(stat);
3000: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3001: PetscCallCUSPARSE(stat);
3002: #endif
3003: #else
3004: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3005: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3006: PetscCallCUSPARSE(stat);
3007: #endif
3008: PetscCall(PetscLogGpuFlops(mmdata->flops));
3009: PetscCallCUDA(WaitForCUDA());
3010: PetscCall(PetscLogGpuTimeEnd());
3011: C->offloadmask = PETSC_OFFLOAD_GPU;
3012: finalize:
3013: /* shorter version of MatAssemblyEnd_SeqAIJ */
3014: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3015: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3016: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3017: c->reallocs = 0;
3018: C->info.mallocs += 0;
3019: C->info.nz_unneeded = 0;
3020: C->assembled = C->was_assembled = PETSC_TRUE;
3021: C->num_ass++;
3022: PetscFunctionReturn(PETSC_SUCCESS);
3023: }
/*
  Symbolic phase of C = op(A)*op(B) for two SeqAIJCUSPARSE matrices.

  Computes the sparsity pattern of C on the GPU with cuSPARSE SpGEMM
  (cusparseSpGEMMreuse for CUDA >= 11.4, cusparseSpGEMM for CUDA 11.x,
  csrgemm for older toolkits), allocates C's CSR storage on the device,
  and mirrors the pattern back to the host so C also works as a plain
  SeqAIJ matrix.

  Notes:
   - cuSPARSE SpGEMM does not support transposed operands (opA/opB below),
     so AtB/ABt are realized by explicitly forming the transpose of A/B.
   - If A (resp. B) is symmetric, AtB (resp. ABt) is remapped to AB; the
     fact is recorded in the product so the numeric phase does the same.
   - Compressed-row storage (zero rows dropped) is handled: C inherits A's
     compressed-row pattern, while B's row offsets are expanded to the full
     matrix when B is compressed.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatProductCtx_MatMatCusparse *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data: holds the SpGEMM descriptors/buffers reused by the numeric phase */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: A^T*B == A*B when A is symmetric (similarly for A*B^T);
     record the remap so the numeric phase can verify it uses the same one */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE; /* C inherits A's nonzero row pattern */
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); /* SpGEMM cannot transpose, so use A^T explicitly */
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); /* SpGEMM cannot transpose, so use B^T explicitly */
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m; /* number of rows actually stored on the GPU */
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices; /* share column/value storage with the compressed CSR */
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count (stored in mmdata so the numeric phase can log it too) */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* outer-product formulation: row i of A contributes anzi*bnzi multiply-adds */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the symbolic pattern to the host so C is a fully set up SeqAIJ matrix */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets: empty rows repeat the previous offset */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3394: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3396: /* handles sparse or dense B */
/* Select the symbolic implementation for a product whose A is SeqAIJCUSPARSE.
   Dispatches to the dense-CUDA kernel when B is dense, to the CUSPARSE SpGEMM
   kernel when the operands live on the GPU, or falls back to the SeqAIJ (CPU)
   code path; the *_backend_cpu options let the user force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *apiname = NULL, *apiopt = NULL, *prodname = NULL;

    /* user-facing names for the backend-selection option of each product type;
       products without a CPU-backend option (e.g. ABt) leave apiname NULL */
    switch (product->type) {
    case MATPRODUCT_AB:
      apiname  = "MatMatMult";
      apiopt   = "-matmatmult_backend_cpu";
      prodname = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname  = "MatTransposeMatMult";
      apiopt   = "-mattransposematmult_backend_cpu";
      prodname = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname  = "MatPtAP";
      apiopt   = "-matptap_backend_cpu";
      prodname = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname  = "MatRARt";
      apiopt   = "-matrart_backend_cpu";
      prodname = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname  = "MatMatMatMult";
      apiopt   = "-matmatmatmult_backend_cpu";
      prodname = "MatProduct_ABC";
      break;
    default:
      break;
    }
    if (apiname) { /* option and section names depend on whether the old-style API (MatMatMult etc.) or MatProduct was used */
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, product->api_user ? apiname : prodname, "Mat");
      PetscCall(PetscOptionsBool(product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu", "Use CPU code", apiname, usecpu, &usecpu, NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized GPU code for these products: compose them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A*xx; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (trans = herm = PETSC_FALSE, no additive vector) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A*xx + yy; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (trans = herm = PETSC_FALSE) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A^H * xx; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (trans = herm = PETSC_TRUE, no additive vector) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A^H * xx + yy; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (trans = herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* yy = A^T * xx; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (trans = PETSC_TRUE, herm = PETSC_FALSE, no additive vector) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* y[idx[i]] += x[i] for 0 <= i < n.

   One thread per entry, 1D grid/block launch. The global index is computed
   in PetscInt (instead of int): with 64-bit PetscInt builds n may exceed
   2^31-1 and the unsigned-int product blockIdx.x * blockDim.x would wrap.
   NOTE(review): the update is not atomic, so entries of idx are assumed
   distinct within one launch — duplicate indices would race; confirm at
   call sites. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   yy may be NULL (plain MatMult) and may alias zz (in-place MatMultAdd). herm without trans is rejected.
   Handles matrices stored in compressed-row form (zero rows dropped) by going through a work vector and
   a scatter(-add) step. Under CUDA >= 11 only the CSR storage format is supported. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: z = y (or zero) without touching cusparse */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cusparse apply the (conjugate) transpose on the fly using the untransposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      /* use (building on demand) an explicitly stored transpose */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols; // since y = Ax
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
      */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows; // since y = A^T x
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
#else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
#endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* lazily create the per-opA sparse-matrix descriptor */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
#endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  if (yy) PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  else PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* zz = A^T*xx + yy on the GPU; thin wrapper over the shared mult/add kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3733: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
/* Kernel: diag[r] = A(r,r) for each of the len rows of a 0-based CSR matrix (row offsets `row`,
   column indices `col`, values `val`). Rows without a stored diagonal entry produce 0.
   One thread per row; launched with enough blocks to cover all len rows. */
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t r = blockIdx.x * blockDim.x + threadIdx.x;

  if (r >= len) return; /* tail threads of the last block have no row to process */
  const PetscInt start = row[r], end = row[r + 1];
  PetscScalar d = 0.0;
  for (PetscInt j = start; j < end; j++) {
    if (col[j] == r) { /* found the stored diagonal entry of this row */
      d = val[j];
      break;
    }
  }
  diag[r] = d;
}
/* Extract the diagonal of A into diag. When the up-to-date values live on the GPU the extraction is
   done there with the GetDiagonal_CSR kernel (CSR format only); otherwise we fall back to the CPU
   SeqAIJ implementation. */
static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct = (Mat_SeqAIJCUSPARSE *)A->spptr ? (Mat_SeqAIJCUSPARSEMultStruct *)cusp->mat : NULL;
  PetscScalar *d;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
    const PetscInt nrows = A->rmap->n;
    CsrMatrix *csr = (CsrMatrix *)mstruct->mat;

    PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
    if (nrows > 0) {
      PetscCall(VecCUDAGetArrayWrite(diag, &d));
      /* one thread per row, 256 threads per block */
      GetDiagonal_CSR<<<(int)((nrows + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), nrows, d);
      PetscCallCUDA(cudaPeekAtLastError()); /* surface kernel-launch errors without clearing sticky state */
      PetscCall(VecCUDARestoreArrayWrite(diag, &d));
    }
  } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Finish assembly by deferring entirely to the CPU SeqAIJ path; GPU data is (re)built lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3782: /*@
3783: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3785: Collective
3787: Input Parameters:
3788: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3789: . m - number of rows
3790: . n - number of columns
 3791: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3792: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3794: Output Parameter:
3795: . A - the matrix
3797: Level: intermediate
3799: Notes:
 3800: This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
3801: calculations. For good matrix assembly performance the user should preallocate the matrix
3802: storage by setting the parameter `nz` (or the array `nnz`).
3804: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
 3805: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3806: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3808: The AIJ format, also called
3809: compressed row storage, is fully compatible with standard Fortran
3810: storage. That is, the stored row and column indices can begin at
3811: either one (as in Fortran) or zero.
3813: Specify the preallocated storage with either nz or nnz (not both).
3814: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3815: allocation.
3817: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
3819: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3820: `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3821: @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); /* sequential matrix: global sizes equal local sizes */
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* cast away const to match the SeqAIJ preallocation signature; nnz is not modified */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Destroy the CUSPARSE-specific data (plain matrix or triangular factors), clear all composed
   query functions installed by MatConvert/MatBindToCPU, then run the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3854: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3855: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate by making a plain SeqAIJ copy, then converting the copy in place back to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Y = Y + a*X. Three on-GPU strategies, chosen by the (possibly upgraded) nonzero-pattern relation:
   - SAME_NONZERO_PATTERN: a single cublas axpy on the value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam (general sparse matrix addition) writing back into Y;
   - otherwise (or when either matrix is bound to CPU): fall back to the SeqAIJ CPU implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar *ay;
  const PetscScalar *ax;
  CsrMatrix *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* one of the two is bound to CPU: use the CPU path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in Y = a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a, b below live on the host, so switch pointer mode around the spgeam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    PetscBLASInt one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
  } else {
    /* patterns differ: the CPU implementation rebuilds Y's structure */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Y = a*Y: scale all stored nonzero values in place on the GPU with a single cublas scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar *vals;
  cublasHandle_t handle;
  PetscBLASInt inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* nz may exceed what a PetscBLASInt can hold; the cast checks */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Thrust functor for left diagonal scaling: multiplies all stored values of compressed row i by lv[row],
   where row is the global row index (looked up through cprow_ptr when the matrix uses compressed rows,
   the identity mapping otherwise). Initialized positionally: {lv_ptr, val_ptr, row_ptr, cprow_ptr}. */
struct DiagonalScaleLeft {
  const PetscScalar *lv_ptr;  /* left scaling vector values */
  PetscScalar *val_ptr;       /* CSR values, scaled in place */
  const int *row_ptr;         /* CSR row offsets */
  const PetscInt *cprow_ptr;  /* compressed-row -> global-row map, or NULL */
  __host__ __device__ void operator()(int i) const
  {
    const int row = cprow_ptr ? (int)cprow_ptr[i] : i;
    const PetscScalar s = lv_ptr[row];
    for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) val_ptr[j] *= s;
  }
};
/* A = diag(l) A diag(r): scale rows of A by l and columns by r, entirely on the GPU.
   Either vector may be NULL to skip that side. Left scaling walks rows with the DiagonalScaleLeft
   functor (handling compressed rows); right scaling multiplies each stored value by r[col] via a
   permutation iterator over the column indices. */
static PetscErrorCode MatDiagonalScale_SeqAIJCUSPARSE(Mat A, Vec l, Vec r)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;
  CsrMatrix *csr;
  const PetscScalar *v;
  PetscScalar *av;
  PetscInt m, n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJCUSPARSEGetArray(A, &av));
  csr = (CsrMatrix *)((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->mat;
  if (l) {
    /* cprow maps compressed row index -> global row index when zero rows are dropped; NULL otherwise */
    const PetscInt *cprow = ((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->cprowIndices ? ((Mat_SeqAIJCUSPARSE *)A->spptr)->mat->cprowIndices->data().get() : NULL;
    DiagonalScaleLeft functor;

    PetscCall(VecGetLocalSize(l, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling Vec of wrong length");
    PetscCall(VecCUDAGetArrayRead(l, &v));
    functor = {v, av, csr->row_offsets->data().get(), cprow};
    PetscCallThrust(thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(csr->num_rows), functor));
    PetscCall(VecCUDARestoreArrayRead(l, &v));
    PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
  }
  PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &av));
  if (r) {
    PetscCall(VecGetLocalSize(r, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling Vec of wrong length");
    PetscCall(VecCUDAGetArrayRead(r, &v));
    /* val[k] *= r[col[k]] for every stored entry; multiplies functor name differs between CCCL versions */
#if CCCL_VERSION >= 3001000
    PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(v), csr->column_indices->begin()), csr->values->begin(), cuda::std::multiplies<PetscScalar>()));
#else
    PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), csr->values->begin(), csr->values->end(), thrust::make_permutation_iterator(thrust::device_pointer_cast(v), csr->column_indices->begin()), csr->values->begin(), thrust::multiplies<PetscScalar>()));
#endif
    PetscCall(VecCUDARestoreArrayRead(r, &v));
    PetscCall(PetscLogGpuFlops(1.0 * aij->nz));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Zero all stored values of A. If an unfactored GPU copy exists, zero it (and any cached explicit
   transpose) on the device and mark the GPU copy authoritative; otherwise zero the host array. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;
  PetscBool zeroedOnDevice = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) { /* keep the cached explicit transpose consistent */
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  if (zeroedOnDevice) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  } else {
    PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Report that this matrix type keeps its data in CUDA device memory */
static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
{
  PetscFunctionBegin;
  *m = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Switch A's operation table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations.
   flg = PETSC_TRUE binds to CPU: values are copied back from the GPU first, the ops and composed
   functions are reset to the SeqAIJ versions, and inode use is re-enabled if available.
   flg = PETSC_FALSE installs the CUSPARSE implementations. Factored matrices only record the flag. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* ensure the host copy is current before the CPU implementations take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale = MatScale_SeqAIJ;
    A->ops->diagonalscale = MatDiagonalScale_SeqAIJ;
    A->ops->getdiagonal = MatGetDiagonal_SeqAIJ;
    A->ops->axpy = MatAXPY_SeqAIJ;
    A->ops->zeroentries = MatZeroEntries_SeqAIJ;
    A->ops->mult = MatMult_SeqAIJ;
    A->ops->multadd = MatMultAdd_SeqAIJ;
    A->ops->multtranspose = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
    A->ops->getcurrentmemtype = NULL;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* drop the CUSPARSE array accessors */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale = MatScale_SeqAIJCUSPARSE;
    A->ops->diagonalscale = MatDiagonalScale_SeqAIJCUSPARSE;
    A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE;
    A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
    A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
    a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size_csr) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE; /* inode optimizations only apply on the CPU */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE (in place, into a new matrix, or reusing an existing one).
   Creates the cusparse handle/spptr (plain or tri-factor flavor) on first conversion, installs the
   CUSPARSE operation table via MatBindToCPU(PETSC_FALSE), and composes the type-specific functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
#endif
      spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of a plain CSR copy */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* install the GPU operation table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatCreate_SeqAIJCUSPARSE - constructor for MATSEQAIJCUSPARSE: build a plain host SeqAIJ
   matrix, then convert it in place to the CUSPARSE type, which installs the GPU data
   structures and the CUSPARSE-specific method table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  /* MAT_INPLACE_MATRIX: B itself is transformed; no new matrix is allocated */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4195: /*MC
4196: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4198: Options Database Keys:
4199: + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4200: . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4201: Other options include ell (ellpack) or hyb (hybrid).
4202: . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4203: - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
4205: Level: beginner
4207: Notes:
4208: These matrices can be in either CSR, ELL, or HYB format.
4210: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4212: Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
4213: if some integer values passed in do not fit in `int`.
4215: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4216: M*/
/*
   MatSolverTypeRegister_CUSPARSE - advertise the cuSPARSE solver backend for every factor
   type it supports (LU, Cholesky, ILU, ICC), all served by the same factory routine.
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (int k = 0; k < (int)(sizeof(ftypes) / sizeof(ftypes[0])); k++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatSeqAIJCUSPARSE_Destroy - free the GPU-side data hanging off mat->spptr: the CSR mult
   structures for the matrix and its cached transpose, scratch thrust arrays, and the
   cuSPARSE handle, then the Mat_SeqAIJCUSPARSE struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    /* thrust arrays allocated with new; delete on NULL is a no-op */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr)); /* frees the struct and NULLs mat->spptr */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   CsrMatrix_Destroy - release a CsrMatrix (its three device arrays plus the struct itself)
   and zero the caller's pointer. Safe to call when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (trifactor overload) - destroy one triangular-factor
   structure from the legacy csrsv solve path (pre-CUDA 11.4): the matrix descriptor, the
   analysis info, the factor's CSR storage, and its work buffers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); /* cudaFree(NULL) is a no-op */
    PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor)); /* also NULLs *trifactor */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - destroy the structure used for MatMult and friends:
   the stored matrix (CSR, or HYB on pre-CUDA-11 builds), the legacy matrix descriptor, the
   compressed-row index array, the device-resident scalar constants, and (CUDA 11+) the
   generic SpMat/DnVec descriptors and per-operation SpMV buffers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        /* the HYB format was removed from cuSPARSE in CUDA 11, so this state is corrupt */
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device copies of 1.0 and 0.0 used as alpha/beta arguments for cuSPARSE calls */
    PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation flavor (see cuSpMV indexing elsewhere in this file) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
#endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatSeqAIJCUSPARSETriFactors_Reset - free all data created during a factorization so the
   structure can be reused for a new symbolic/numeric factorization (or destroyed).

   Every freed device pointer is reset to NULL and every cuSPARSE descriptor destroy is
   guarded and NULLed afterwards, making this routine idempotent: an explicit reset followed
   by MatSeqAIJCUSPARSETriFactors_Destroy() (which calls this again) must not re-free stale
   device pointers or pass already-destroyed handles to cuSPARSE. PetscFree() NULLs its
   argument itself, so the host-side arrays need no extra handling.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy (pre-CUDA 11.4) csrsv triangular-factor structures; Destroy NULLs the pointers */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* cudaFree(NULL) is a no-op, but NULL the pointers afterwards so a second call is safe */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    fs->csrRowPtr   = NULL;
    fs->csrColIdx   = NULL;
    fs->csrRowPtr32 = NULL;
    fs->csrColIdx32 = NULL;
    fs->csrVal      = NULL;
    fs->diag        = NULL;
    fs->X           = NULL;
    fs->Y           = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    /* guard the cuSPARSE destroys so a stale (already destroyed) handle is never passed in */
    if (fs->matDescr_M) {
      PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
      fs->matDescr_M = NULL;
    }
    if (fs->spMatDescr_L) {
      PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
      fs->spMatDescr_L = NULL;
    }
    if (fs->spMatDescr_U) {
      PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
      fs->spMatDescr_U = NULL;
    }
    if (fs->spsvDescr_L) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
      fs->spsvDescr_L = NULL;
    }
    if (fs->spsvDescr_Lt) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
      fs->spsvDescr_Lt = NULL;
    }
    if (fs->spsvDescr_U) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
      fs->spsvDescr_U = NULL;
    }
    if (fs->spsvDescr_Ut) {
      PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
      fs->spsvDescr_Ut = NULL;
    }
    if (fs->dnVecDescr_X) {
      PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
      fs->dnVecDescr_X = NULL;
    }
    if (fs->dnVecDescr_Y) {
      PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
      fs->dnVecDescr_Y = NULL;
    }
    if (fs->ilu0Info_M) {
      PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
      fs->ilu0Info_M = NULL;
    }
    if (fs->ic0Info_M) {
      PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
      fs->ic0Info_M = NULL;
    }
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - fully dispose of the factor container: reset all
   factorization data, destroy the cuSPARSE handle, and free the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy(fs->handle));
    PetscCall(PetscFree(*trifactors)); /* NULLs *trifactors as well */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached explicit transpose as out of date;
   when destroy is true, also free the transpose mult structure and the csr2csc permutation
   so they will be rebuilt from scratch next time they are needed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatCOOStructDestroy_SeqAIJCUSPARSE - container destructor for the device-side COO
   assembly struct: frees the device copies of perm and jmap and the (shallow-copied)
   struct itself. The remaining fields are shared with the host-side struct, which owns
   them and frees them through its own destructor.
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(PetscCtxRt ctx)
{
  MatCOOStruct_SeqAIJ *coo = *(MatCOOStruct_SeqAIJ **)ctx;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - set up COO assembly for the GPU: build the CSR
   pattern on the host (via the SeqAIJ implementation), push the matrix to the device, and
   mirror the COO-to-CSR mapping (jmap, perm) into device memory so MatSetValuesCOO can run
   entirely on the GPU. coo_i/coo_j may live in host or device memory.
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* the host-side preallocation routine needs host copies of the indices */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* the CSR pattern was just built on the host */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   MatAddCOOValues - device kernel that scatter-adds user COO values into the CSR value array.

   One logical thread per CSR nonzero; a grid-stride loop makes any 1-D launch configuration
   cover all nnz entries. For CSR slot i, jmap[i]..jmap[i+1] indexes perm[], which gives the
   positions in the user-provided COO value array kv[] that map to that slot (repeated COO
   entries are summed here). With INSERT_VALUES the previous a[i] is discarded; otherwise
   (ADD_VALUES) the sum is accumulated onto it.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - insert or add a new batch of COO values on the GPU using
   the mapping built by MatSetPreallocationCOO_SeqAIJCUSPARSE. v[] may live in host or
   device memory; host data is staged through a temporary device buffer.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  /* INSERT_VALUES overwrites everything, so write-only access avoids a host-to-device copy */
  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* the kernel uses a grid-stride loop, so this ceil-div grid covers all nonzeros */
    MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch errors without clearing the error state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4509: /*@C
4510: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4512: Not Collective
4514: Input Parameters:
4515: + A - the matrix
4516: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4518: Output Parameters:
4519: + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4520: - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4522: Level: developer
4524: Note:
4525: When compressed is true, the CSR structure does not contain empty rows
4527: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4528: @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* NOTE(review): this early return requires BOTH outputs; a caller passing only i or only j
     gets nothing back. The later per-pointer checks are then redundant — confirm intent. */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device CSR is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is stored compressed; lazily upload full row offsets from the host copy */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4557: /*@C
4558: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4560: Not Collective
4562: Input Parameters:
4563: + A - the matrix
4564: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
4565: . i - the CSR row pointers
4566: - j - the CSR column indices
4568: Level: developer
4570: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4571: @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* nothing to undo; parameter kept for symmetry with the Get routine */
  /* invalidate the borrowed device pointers so the caller cannot keep using them */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4583: /*@C
4584: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored
4586: Not Collective
4588: Input Parameter:
4589: . A - a `MATSEQAIJCUSPARSE` matrix
4591: Output Parameter:
4592: . a - pointer to the device data
4594: Level: developer
4596: Note:
4597: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4599: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4600: @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: make sure device data is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the thrust array; offloadmask unchanged */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4619: /*@C
4620: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4622: Not Collective
4624: Input Parameters:
4625: + A - a `MATSEQAIJCUSPARSE` matrix
4626: - a - pointer to the device data
4628: Level: developer
4630: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4631: @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4642: /*@C
4643: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4645: Not Collective
4647: Input Parameter:
4648: . A - a `MATSEQAIJCUSPARSE` matrix
4650: Output Parameter:
4651: . a - pointer to the device data
4653: Level: developer
4655: Note:
4656: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4658: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4659: @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: existing values must be current on device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: the GPU copy becomes the authoritative one
     and any cached transpose values are now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4679: /*@C
4680: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4682: Not Collective
4684: Input Parameters:
4685: + A - a `MATSEQAIJCUSPARSE` matrix
4686: - a - pointer to the device data
4688: Level: developer
4690: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4691: @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer: bump the object state so
     anything caching results of this matrix knows to recompute */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4703: /*@C
4704: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4706: Not Collective
4708: Input Parameter:
4709: . A - a `MATSEQAIJCUSPARSE` matrix
4711: Output Parameter:
4712: . a - pointer to the device data
4714: Level: developer
4716: Note:
4717: Does not trigger any host to device copies.
4719: It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current
4721: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4722: @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: deliberately no MatSeqAIJCUSPARSECopyToGPU() — the device structure must
     already exist (errors below otherwise) but its values need not be current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller promises to overwrite: GPU copy becomes authoritative, cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4742: /*@C
4743: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4745: Not Collective
4747: Input Parameters:
4748: + A - a `MATSEQAIJCUSPARSE` matrix
4749: - a - pointer to the device data
4751: Level: developer
4753: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4754: @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the caller wrote new values: bump the object state so dependents recompute */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic by row then column,
   ignoring the value and flag components. Used to merge two sorted COO streams. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    return r1 < r2 || (r1 == r2 && thrust::get<1>(t1) < thrust::get<1>(t2));
  }
};
/* Unary functor that offsets an index by a fixed amount; used below to shift B's column
   indices by A's column count when concatenating [A B]. */
struct Shift {
  int _shift; /* amount added to every input index (may be negative to undo the shift) */

  Shift(int shift) : _shift(shift) { }
  /* const-qualified: thrust transform iterators may invoke the functor through a const
     copy, which requires a const call operator */
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};
4782: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4783: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4784: {
4785: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4786: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4787: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4788: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4789: PetscInt Annz, Bnnz;
4790: cusparseStatus_t stat;
4791: PetscInt i, m, n, zero = 0;
4793: PetscFunctionBegin;
4796: PetscAssertPointer(C, 4);
4797: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4798: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4799: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4800: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4801: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4802: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4803: if (reuse == MAT_INITIAL_MATRIX) {
4804: m = A->rmap->n;
4805: n = A->cmap->n + B->cmap->n;
4806: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4807: PetscCall(MatSetSizes(*C, m, n, m, n));
4808: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4809: c = (Mat_SeqAIJ *)(*C)->data;
4810: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4811: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4812: Ccsr = new CsrMatrix;
4813: Cmat->cprowIndices = NULL;
4814: c->compressedrow.use = PETSC_FALSE;
4815: c->compressedrow.nrows = 0;
4816: c->compressedrow.i = NULL;
4817: c->compressedrow.rindex = NULL;
4818: Ccusp->workVector = NULL;
4819: Ccusp->nrows = m;
4820: Ccusp->mat = Cmat;
4821: Ccusp->mat->mat = Ccsr;
4822: Ccsr->num_rows = m;
4823: Ccsr->num_cols = n;
4824: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4825: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4826: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4827: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4828: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4829: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4830: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4831: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4832: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4833: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4834: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4835: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4836: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4838: Acsr = (CsrMatrix *)Acusp->mat->mat;
4839: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4840: Annz = (PetscInt)Acsr->column_indices->size();
4841: Bnnz = (PetscInt)Bcsr->column_indices->size();
4842: c->nz = Annz + Bnnz;
4843: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4844: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4845: Ccsr->values = new THRUSTARRAY(c->nz);
4846: Ccsr->num_entries = c->nz;
4847: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4848: if (c->nz) {
4849: auto Acoo = new THRUSTINTARRAY32(Annz);
4850: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4851: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4852: THRUSTINTARRAY32 *Aroff, *Broff;
4854: if (a->compressedrow.use) { /* need full row offset */
4855: if (!Acusp->rowoffsets_gpu) {
4856: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4857: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4858: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4859: }
4860: Aroff = Acusp->rowoffsets_gpu;
4861: } else Aroff = Acsr->row_offsets;
4862: if (b->compressedrow.use) { /* need full row offset */
4863: if (!Bcusp->rowoffsets_gpu) {
4864: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4865: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4866: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4867: }
4868: Broff = Bcusp->rowoffsets_gpu;
4869: } else Broff = Bcsr->row_offsets;
4870: PetscCall(PetscLogGpuTimeBegin());
4871: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4872: PetscCallCUSPARSE(stat);
4873: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4874: PetscCallCUSPARSE(stat);
4875: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4876: auto Aperm = thrust::make_constant_iterator(1);
4877: auto Bperm = thrust::make_constant_iterator(0);
4878: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4879: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4880: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4881: #else
4882: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4883: auto Bcib = Bcsr->column_indices->begin();
4884: auto Bcie = Bcsr->column_indices->end();
4885: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4886: #endif
4887: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4888: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4889: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4890: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4891: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4892: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4893: auto p1 = Ccusp->coords->begin();
4894: auto p2 = Ccusp->coords->begin();
4895: #if CCCL_VERSION >= 3001000
4896: cuda::std::advance(p2, Annz);
4897: #else
4898: thrust::advance(p2, Annz);
4899: #endif
4900: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4901: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4902: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4903: #endif
4904: auto cci = thrust::make_counting_iterator(zero);
4905: auto cce = thrust::make_counting_iterator(c->nz);
4906: #if 0 //Errors on SUMMIT cuda 11.1.0
4907: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4908: #else
4909: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4910: auto pred = thrust::identity<int>();
4911: #else
4912: auto pred = cuda::std::identity();
4913: #endif
4914: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4915: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4916: #endif
4917: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4918: PetscCallCUSPARSE(stat);
4919: PetscCall(PetscLogGpuTimeEnd());
4920: delete wPerm;
4921: delete Acoo;
4922: delete Bcoo;
4923: delete Ccoo;
4924: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4925: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4926: PetscCallCUSPARSE(stat);
4927: #endif
4928: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4929: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4930: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4931: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4932: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4933: CsrMatrix *CcsrT = new CsrMatrix;
4934: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4935: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4937: (*C)->form_explicit_transpose = PETSC_TRUE;
4938: (*C)->transupdated = PETSC_TRUE;
4939: Ccusp->rowoffsets_gpu = NULL;
4940: CmatT->cprowIndices = NULL;
4941: CmatT->mat = CcsrT;
4942: CcsrT->num_rows = n;
4943: CcsrT->num_cols = m;
4944: CcsrT->num_entries = c->nz;
4946: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4947: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4948: CcsrT->values = new THRUSTARRAY(c->nz);
4950: PetscCall(PetscLogGpuTimeBegin());
4951: auto rT = CcsrT->row_offsets->begin();
4952: if (AT) {
4953: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4954: #if CCCL_VERSION >= 3001000
4955: cuda::std::advance(rT, -1);
4956: #else
4957: thrust::advance(rT, -1);
4958: #endif
4959: }
4960: if (BT) {
4961: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4962: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4963: thrust::copy(titb, tite, rT);
4964: }
4965: auto cT = CcsrT->column_indices->begin();
4966: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4967: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4968: auto vT = CcsrT->values->begin();
4969: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4970: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4971: PetscCall(PetscLogGpuTimeEnd());
4973: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4974: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4975: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4976: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4977: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4978: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4979: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4980: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4981: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4982: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4983: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4984: PetscCallCUSPARSE(stat);
4985: #endif
4986: Ccusp->matTranspose = CmatT;
4987: }
4988: }
4990: c->free_a = PETSC_TRUE;
4991: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4992: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4993: c->free_ij = PETSC_TRUE;
4994: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4995: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4996: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4997: ii = *Ccsr->row_offsets;
4998: jj = *Ccsr->column_indices;
4999: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5000: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5001: } else {
5002: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5003: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
5004: }
5005: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
5006: PetscCall(PetscMalloc1(m, &c->ilen));
5007: PetscCall(PetscMalloc1(m, &c->imax));
5008: c->maxnz = c->nz;
5009: c->nonzerorowcnt = 0;
5010: c->rmax = 0;
5011: for (i = 0; i < m; i++) {
5012: const PetscInt nn = c->i[i + 1] - c->i[i];
5013: c->ilen[i] = c->imax[i] = nn;
5014: c->nonzerorowcnt += (PetscInt)!!nn;
5015: c->rmax = PetscMax(c->rmax, nn);
5016: }
5017: PetscCall(PetscMalloc1(c->nz, &c->a));
5018: (*C)->nonzerostate++;
5019: PetscCall(PetscLayoutSetUp((*C)->rmap));
5020: PetscCall(PetscLayoutSetUp((*C)->cmap));
5021: Ccusp->nonzerostate = (*C)->nonzerostate;
5022: (*C)->preallocated = PETSC_TRUE;
5023: } else {
5024: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
5025: c = (Mat_SeqAIJ *)(*C)->data;
5026: if (c->nz) {
5027: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
5028: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
5029: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
5030: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
5031: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5032: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5033: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5034: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
5035: Acsr = (CsrMatrix *)Acusp->mat->mat;
5036: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
5037: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
5038: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
5039: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
5040: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
5041: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
5042: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
5043: auto pmid = Ccusp->coords->begin();
5044: #if CCCL_VERSION >= 3001000
5045: cuda::std::advance(pmid, Acsr->num_entries);
5046: #else
5047: thrust::advance(pmid, Acsr->num_entries);
5048: #endif
5049: PetscCall(PetscLogGpuTimeBegin());
5050: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
5051: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5052: thrust::for_each(zibait, zieait, VecCUDAEquals());
5053: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
5054: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
5055: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
5056: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
5057: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5058: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5059: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5060: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5061: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5062: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5063: auto vT = CcsrT->values->begin();
5064: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5065: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5066: (*C)->transupdated = PETSC_TRUE;
5067: }
5068: PetscCall(PetscLogGpuTimeEnd());
5069: }
5070: }
5071: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5072: (*C)->assembled = PETSC_TRUE;
5073: (*C)->was_assembled = PETSC_FALSE;
5074: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
5075: PetscFunctionReturn(PETSC_SUCCESS);
5076: }
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gathers n entries of the matrix value array into v.

  Input Parameters:
+ A   - the SeqAIJCUSPARSE matrix; its value array is read on the device
. n   - number of entries to copy
- idx - (optional) 0-based positions (into the aij value array) to gather; if NULL, the first n values are copied contiguously

  Output Parameter:
. v   - destination buffer; may be either host or device memory (detected with isCudaMem())

  Notes:
  When idx is given, the gather is performed entirely on the GPU via a permutation
  iterator; for a host destination the result is staged in a temporary device array
  and then copied down. When idx is NULL a single cudaMemcpy suffices.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem; /* true if v lives in device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL; /* device staging buffer, only for a host destination */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[i]] -> dv[i] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination received device data above, so log the transfer as GPU->CPU
     (was PetscLogCpuToGpu, which mislabeled the direction of this copy) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}