Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #endif
19: #include <thrust/iterator/constant_iterator.h>
20: #include <thrust/remove.h>
21: #include <thrust/sort.h>
22: #include <thrust/unique.h>
23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24: #include <cuda/std/functional>
25: #endif
27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29: /*
30: The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32: */
33: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
36: #endif
38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
43: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
44: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
46: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48: #endif
49: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
51: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
52: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
53: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
54: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
55: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
60: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
63: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
65: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
68: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
72: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
73: {
74: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
76: PetscFunctionBegin;
77: switch (op) {
78: case MAT_CUSPARSE_MULT:
79: cusparsestruct->format = format;
80: break;
81: case MAT_CUSPARSE_ALL:
82: cusparsestruct->format = format;
83: break;
84: default:
85: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
86: }
87: PetscFunctionReturn(PETSC_SUCCESS);
88: }
90: /*@
91: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
92: operation. Only the `MatMult()` operation can use different GPU storage formats.
94: Not Collective
96: Input Parameters:
97: + A - Matrix of type `MATSEQAIJCUSPARSE`
98: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
99: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
100: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)
102: Level: intermediate
104: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105: @*/
106: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
107: {
108: PetscFunctionBegin;
110: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
111: PetscFunctionReturn(PETSC_SUCCESS);
112: }
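/* Usage sketch (illustrative only, not part of the original source): a caller that wants ELL storage
   for MatMult() on a MATSEQAIJCUSPARSE matrix might do, after creating A,

     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

   or, equivalently, select the format at run time with -mat_cusparse_mult_storage_format ell,
   which is parsed in MatSetFromOptions_SeqAIJCUSPARSE() below. */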
114: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115: {
116: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
118: PetscFunctionBegin;
119: cusparsestruct->use_cpu_solve = use_cpu;
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: /*@
124: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
126: Input Parameters:
127: + A - Matrix of type `MATSEQAIJCUSPARSE`
128: - use_cpu - set flag for using the built-in CPU `MatSolve()`
130: Level: intermediate
132: Note:
133: The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135: Use this method to specify whether the solve is done on the CPU or on the GPU (GPU is the default).
137: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138: @*/
139: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
140: {
141: PetscFunctionBegin;
143: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
144: PetscFunctionReturn(PETSC_SUCCESS);
145: }
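/* Usage sketch (illustrative only, not part of the original source): to keep the (I)LU triangular
   solves on the CPU for a factored MATSEQAIJCUSPARSE matrix,

     PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));

   or pass -mat_cusparse_use_cpu_solve on the command line, parsed in MatSetFromOptions_SeqAIJCUSPARSE() below. */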
147: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148: {
149: PetscFunctionBegin;
150: switch (op) {
151: case MAT_FORM_EXPLICIT_TRANSPOSE:
152: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
153: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154: A->form_explicit_transpose = flg;
155: break;
156: default:
157: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158: break;
159: }
160: PetscFunctionReturn(PETSC_SUCCESS);
161: }
163: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164: {
165: MatCUSPARSEStorageFormat format;
166: PetscBool flg;
167: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
169: PetscFunctionBegin;
170: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171: if (A->factortype == MAT_FACTOR_NONE) {
172: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
175: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184: #else
185: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186: #endif
187: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
190: PetscCall(
191: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193: #endif
194: }
195: PetscOptionsHeadEnd();
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
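/* Example (illustrative only): the options registered above might be combined on the command line as
     -mat_cusparse_storage_format csr -mat_cusparse_use_cpu_solve
   where the storage-format value names come from MatCUSPARSEStorageFormats[] defined above. */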
199: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
200: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201: {
202: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
203: PetscInt m = A->rmap->n;
204: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
206: const MatScalar *Aa = a->a;
207: PetscInt *Mi, *Mj, Mnz;
208: PetscScalar *Ma;
210: PetscFunctionBegin;
211: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
212: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
213: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
214: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
215: Mnz = (Ai[m] - Ai[0]) + (adiag[0] - adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
216: PetscCall(PetscMalloc1(m + 1, &Mi));
217: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
218: PetscCall(PetscMalloc1(Mnz, &Ma));
219: Mi[0] = 0;
220: for (PetscInt i = 0; i < m; i++) {
221: PetscInt llen = Ai[i + 1] - Ai[i];
222: PetscInt ulen = adiag[i] - adiag[i + 1];
223: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
224: Mj[Mi[i] + llen] = i; // diagonal entry
225: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
226: Mi[i + 1] = Mi[i] + llen + ulen;
227: }
228: // Copy M (L,U) from host to device
229: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
230: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
231: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
232: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
233: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
235: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
236: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
237: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
238: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
239: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
240: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
241: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
242: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
244: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
245: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
246: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
248: fillMode = CUSPARSE_FILL_MODE_UPPER;
249: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
250: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
251: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
252: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
254: // Allocate work vectors in SpSv
255: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
256: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
258: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
259: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
261: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
262: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
263: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
264: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
265: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
266: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
267: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
269: // Record for reuse
270: fs->csrRowPtr_h = Mi;
271: fs->csrVal_h = Ma;
272: PetscCall(PetscFree(Mj));
273: }
274: // Copy the values
275: Mi = fs->csrRowPtr_h;
276: Ma = fs->csrVal_h;
277: Mnz = Mi[m];
278: for (PetscInt i = 0; i < m; i++) {
279: PetscInt llen = Ai[i + 1] - Ai[i];
280: PetscInt ulen = adiag[i] - adiag[i + 1];
281: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
282: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[adiag[i]]; // recover the diagonal entry
283: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
284: }
285: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
287: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
288: if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
289: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
290: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
292: } else
293: #endif
294: {
295: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
296: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
298: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
299: fs->updatedSpSVAnalysis = PETSC_TRUE;
300: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
301: }
302: }
303: PetscFunctionReturn(PETSC_SUCCESS);
304: }
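/* Layout sketch (illustrative only, not part of the original source): for a row i of the factored SeqAIJ
   matrix with llen = Ai[i+1]-Ai[i] entries of L and ulen = adiag[i]-adiag[i+1] entries of U (diagonal
   included), the routine above packs row i of the merged CSR matrix M as

     Mj + Mi[i]: [ cols of L | i                 | cols of U right of the diagonal ]
     Ma + Mi[i]: [ vals of L | 1.0/Aa[adiag[i]]  | vals of U right of the diagonal ]

   i.e. L entries first, then the diagonal (recovered from its stored inverse), then the strictly upper
   part of U, so a single csrRowPtr/csrColIdx/csrVal triple backs both spMatDescr_L (unit lower) and
   spMatDescr_U (non-unit upper). */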
305: #else
306: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
307: {
308: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
309: PetscInt n = A->rmap->n;
310: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
311: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
312: const PetscInt *ai = a->i, *aj = a->j, *vi;
313: const MatScalar *aa = a->a, *v;
314: PetscInt *AiLo, *AjLo;
315: PetscInt i, nz, nzLower, offset, rowOffset;
317: PetscFunctionBegin;
318: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
319: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
320: try {
321: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
322: nzLower = n + ai[n] - ai[1];
323: if (!loTriFactor) {
324: PetscScalar *AALo;
326: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
328: /* Allocate Space for the lower triangular matrix */
329: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
330: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
332: /* Fill the lower triangular matrix */
333: AiLo[0] = (PetscInt)0;
334: AiLo[n] = nzLower;
335: AjLo[0] = (PetscInt)0;
336: AALo[0] = (MatScalar)1.0;
337: v = aa;
338: vi = aj;
339: offset = 1;
340: rowOffset = 1;
341: for (i = 1; i < n; i++) {
342: nz = ai[i + 1] - ai[i];
343: /* additional 1 for the term on the diagonal */
344: AiLo[i] = rowOffset;
345: rowOffset += nz + 1;
347: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
348: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
350: offset += nz;
351: AjLo[offset] = (PetscInt)i;
352: AALo[offset] = (MatScalar)1.0;
353: offset += 1;
355: v += nz;
356: vi += nz;
357: }
359: /* allocate space for the triangular factor information */
360: PetscCall(PetscNew(&loTriFactor));
361: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
362: /* Create the matrix description */
363: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
364: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
365: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
366: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
367: #else
368: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
369: #endif
370: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
371: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
373: /* set the operation */
374: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
376: /* set the matrix */
377: loTriFactor->csrMat = new CsrMatrix;
378: loTriFactor->csrMat->num_rows = n;
379: loTriFactor->csrMat->num_cols = n;
380: loTriFactor->csrMat->num_entries = nzLower;
382: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
383: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
385: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
386: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
388: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
389: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
391: /* Create the solve analysis information */
392: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
393: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
394: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
395: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
396: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
397: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
398: #endif
400: /* perform the solve analysis */
401: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
402: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
403: PetscCallCUDA(WaitForCUDA());
404: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
406: /* assign the pointer */
407: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
408: loTriFactor->AA_h = AALo;
409: PetscCallCUDA(cudaFreeHost(AiLo));
410: PetscCallCUDA(cudaFreeHost(AjLo));
411: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
412: } else { /* update values only */
413: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
414: /* Fill the lower triangular matrix */
415: loTriFactor->AA_h[0] = 1.0;
416: v = aa;
417: vi = aj;
418: offset = 1;
419: for (i = 1; i < n; i++) {
420: nz = ai[i + 1] - ai[i];
421: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
422: offset += nz;
423: loTriFactor->AA_h[offset] = 1.0;
424: offset += 1;
425: v += nz;
426: }
427: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
428: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
429: }
430: } catch (char *ex) {
431: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
432: }
433: }
434: PetscFunctionReturn(PETSC_SUCCESS);
435: }
437: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
438: {
439: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
440: PetscInt n = A->rmap->n;
441: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
442: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
443: const PetscInt *aj = a->j, *adiag, *vi;
444: const MatScalar *aa = a->a, *v;
445: PetscInt *AiUp, *AjUp;
446: PetscInt i, nz, nzUpper, offset;
448: PetscFunctionBegin;
449: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
450: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
451: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
452: try {
453: /* next, figure out the number of nonzeros in the upper triangular matrix. */
454: nzUpper = adiag[0] - adiag[n];
455: if (!upTriFactor) {
456: PetscScalar *AAUp;
458: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
460: /* Allocate Space for the upper triangular matrix */
461: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
462: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
464: /* Fill the upper triangular matrix */
465: AiUp[0] = (PetscInt)0;
466: AiUp[n] = nzUpper;
467: offset = nzUpper;
468: for (i = n - 1; i >= 0; i--) {
469: v = aa + adiag[i + 1] + 1;
470: vi = aj + adiag[i + 1] + 1;
472: /* number of elements NOT on the diagonal */
473: nz = adiag[i] - adiag[i + 1] - 1;
475: /* decrement the offset */
476: offset -= (nz + 1);
478: /* first, set the diagonal elements */
479: AjUp[offset] = (PetscInt)i;
480: AAUp[offset] = (MatScalar)1. / v[nz];
481: AiUp[i] = AiUp[i + 1] - (nz + 1);
483: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
484: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
485: }
487: /* allocate space for the triangular factor information */
488: PetscCall(PetscNew(&upTriFactor));
489: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
491: /* Create the matrix description */
492: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
493: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
494: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
495: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
496: #else
497: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
498: #endif
499: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
500: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
502: /* set the operation */
503: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
505: /* set the matrix */
506: upTriFactor->csrMat = new CsrMatrix;
507: upTriFactor->csrMat->num_rows = n;
508: upTriFactor->csrMat->num_cols = n;
509: upTriFactor->csrMat->num_entries = nzUpper;
511: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
512: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
514: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
515: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
517: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
518: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
520: /* Create the solve analysis information */
521: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
522: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
523: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
524: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
525: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
526: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
527: #endif
529: /* perform the solve analysis */
530: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
531: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
533: PetscCallCUDA(WaitForCUDA());
534: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
536: /* assign the pointer */
537: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
538: upTriFactor->AA_h = AAUp;
539: PetscCallCUDA(cudaFreeHost(AiUp));
540: PetscCallCUDA(cudaFreeHost(AjUp));
541: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
542: } else {
543: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
544: /* Fill the upper triangular matrix */
545: offset = nzUpper;
546: for (i = n - 1; i >= 0; i--) {
547: v = aa + adiag[i + 1] + 1;
549: /* number of elements NOT on the diagonal */
550: nz = adiag[i] - adiag[i + 1] - 1;
552: /* decrement the offset */
553: offset -= (nz + 1);
555: /* first, set the diagonal elements */
556: upTriFactor->AA_h[offset] = 1. / v[nz];
557: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
558: }
559: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
560: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
561: }
562: } catch (char *ex) {
563: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
564: }
565: }
566: PetscFunctionReturn(PETSC_SUCCESS);
567: }
568: #endif
570: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
571: {
572: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
573: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
574: IS isrow = a->row, isicol = a->icol;
575: PetscBool row_identity, col_identity;
576: PetscInt n = A->rmap->n;
578: PetscFunctionBegin;
579: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
580: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
581: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
582: #else
583: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
584: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
585: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
586: #endif
588: cusparseTriFactors->nnz = a->nz;
590: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
591: /* row permutation indices */
592: PetscCall(ISIdentity(isrow, &row_identity));
593: if (!row_identity && !cusparseTriFactors->rpermIndices) {
594: const PetscInt *r;
596: PetscCall(ISGetIndices(isrow, &r));
597: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
598: cusparseTriFactors->rpermIndices->assign(r, r + n);
599: PetscCall(ISRestoreIndices(isrow, &r));
600: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
601: }
603: /* column permutation indices */
604: PetscCall(ISIdentity(isicol, &col_identity));
605: if (!col_identity && !cusparseTriFactors->cpermIndices) {
606: const PetscInt *c;
608: PetscCall(ISGetIndices(isicol, &c));
609: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
610: cusparseTriFactors->cpermIndices->assign(c, c + n);
611: PetscCall(ISRestoreIndices(isicol, &c));
612: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
613: }
614: PetscFunctionReturn(PETSC_SUCCESS);
615: }
617: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
618: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
619: {
620: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
621: PetscInt m = A->rmap->n;
622: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
623: const PetscInt *Ai = a->i, *Aj = a->j, *adiag;
624: const MatScalar *Aa = a->a;
625: PetscInt *Mj, Mnz;
626: PetscScalar *Ma, *D;
628: PetscFunctionBegin;
629: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
630: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
631: if (!fs->csrRowPtr) { // Is this the first time doing the setup? Use csrRowPtr since it is not null even when m=0
632: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
633: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
634: Mnz = Ai[m]; // Unz (with the unit diagonal)
635: PetscCall(PetscMalloc1(Mnz, &Ma));
636: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
637: PetscCall(PetscMalloc1(m, &D)); // the diagonal
638: for (PetscInt i = 0; i < m; i++) {
639: PetscInt ulen = Ai[i + 1] - Ai[i];
640: Mj[Ai[i]] = i; // diagonal entry
641: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
642: }
643: // Copy M (U) from host to device
644: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
645: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
646: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
647: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
648: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
649: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
651: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
652: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
653: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
654: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
655: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
656: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
657: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
658: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
660: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
661: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
662: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
664: // Allocate work vectors in SpSv
665: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
666: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
668: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
669: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
671: // Query buffer sizes for SpSV and then allocate buffers
672: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
673: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
674: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
676: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
677: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
678: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
680: // Record for reuse
681: fs->csrVal_h = Ma;
682: fs->diag_h = D;
683: PetscCall(PetscFree(Mj));
684: }
685: // Copy the values
686: Ma = fs->csrVal_h;
687: D = fs->diag_h;
688: Mnz = Ai[m];
689: for (PetscInt i = 0; i < m; i++) {
690: D[i] = Aa[adiag[i]]; // actually Aa[adiag[i]] is the inverse of the diagonal
691: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
692: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
693: }
694: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
695: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
697: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
698: if (fs->updatedSpSVAnalysis) {
699: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
700: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
701: } else
702: #endif
703: {
704: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
705: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
706: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
707: fs->updatedSpSVAnalysis = PETSC_TRUE;
708: }
709: }
710: PetscFunctionReturn(PETSC_SUCCESS);
711: }
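/* Solve-path note (illustrative only, not part of the original source): with the factored matrix built
   above, MatSolve_SeqAIJCUSPARSE_Cholesky() below solves Ut D U x = b in three steps: an SpSV solve with
   CUSPARSE_OPERATION_TRANSPOSE on spMatDescr_U (Ut y = b), an element-wise multiply by fs->diag (which
   already holds the inverted diagonal copied from Aa[adiag[i]]), and an SpSV solve with
   CUSPARSE_OPERATION_NON_TRANSPOSE on spMatDescr_U (U x = y), with optional row/column permutations
   applied before and after. */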
713: // Solve Ut D U x = b
714: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
715: {
716: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
717: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
718: const PetscScalar *barray;
719: PetscScalar *xarray;
720: thrust::device_ptr<const PetscScalar> bGPU;
721: thrust::device_ptr<PetscScalar> xGPU;
722: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
723: PetscInt m = A->rmap->n;
725: PetscFunctionBegin;
726: PetscCall(PetscLogGpuTimeBegin());
727: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
728: PetscCall(VecCUDAGetArrayRead(b, &barray));
729: xGPU = thrust::device_pointer_cast(xarray);
730: bGPU = thrust::device_pointer_cast(barray);
732: // Reorder b with the row permutation if needed, and wrap the result in fs->X
733: if (fs->rpermIndices) {
734: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
735: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
736: } else {
737: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
738: }
740: // Solve Ut Y = X
741: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
742: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
744: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
745: // It is basically a vector element-wise multiplication, but cublas does not have it!
746: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
748: // Solve U X = Y
749: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
750: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
751: } else {
752: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
753: }
754: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
756: // Reorder X with the column permutation if needed, and put the result back to x
757: if (fs->cpermIndices) {
758: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
759: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
760: }
762: PetscCall(VecCUDARestoreArrayRead(b, &barray));
763: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
764: PetscCall(PetscLogGpuTimeEnd());
765: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
766: PetscFunctionReturn(PETSC_SUCCESS);
767: }
768: #else
769: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
770: {
771: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
772: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
773: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
774: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
775: PetscInt *AiUp, *AjUp;
776: PetscScalar *AAUp;
777: PetscScalar *AALo;
778: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
779: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
780: const PetscInt *ai = b->i, *aj = b->j, *vj;
781: const MatScalar *aa = b->a, *v;
783: PetscFunctionBegin;
784: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
785: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
786: try {
787: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
788: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
789: if (!upTriFactor && !loTriFactor) {
790: /* Allocate Space for the upper triangular matrix */
791: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
792: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
794: /* Fill the upper triangular matrix */
795: AiUp[0] = (PetscInt)0;
796: AiUp[n] = nzUpper;
797: offset = 0;
798: for (i = 0; i < n; i++) {
799: /* set the pointers */
800: v = aa + ai[i];
801: vj = aj + ai[i];
802: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
804: /* first, set the diagonal elements */
805: AjUp[offset] = (PetscInt)i;
806: AAUp[offset] = (MatScalar)1.0 / v[nz];
807: AiUp[i] = offset;
808: AALo[offset] = (MatScalar)1.0 / v[nz];
810: offset += 1;
811: if (nz > 0) {
812: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
813: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
814: for (j = offset; j < offset + nz; j++) {
815: AAUp[j] = -AAUp[j];
816: AALo[j] = AAUp[j] / v[nz];
817: }
818: offset += nz;
819: }
820: }
822: /* allocate space for the triangular factor information */
823: PetscCall(PetscNew(&upTriFactor));
824: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
826: /* Create the matrix description */
827: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
828: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
829: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
830: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
831: #else
832: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
833: #endif
834: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
835: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
837: /* set the matrix */
838: upTriFactor->csrMat = new CsrMatrix;
839: upTriFactor->csrMat->num_rows = A->rmap->n;
840: upTriFactor->csrMat->num_cols = A->cmap->n;
841: upTriFactor->csrMat->num_entries = a->nz;
843: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
844: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
846: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
847: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
849: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
850: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
852: /* set the operation */
853: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
855: /* Create the solve analysis information */
856: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
857: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
858: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
859: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
860: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
861: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
862: #endif
864: /* perform the solve analysis */
865: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
866: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
868: PetscCallCUDA(WaitForCUDA());
869: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
871: /* assign the pointer */
872: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
874: /* allocate space for the triangular factor information */
875: PetscCall(PetscNew(&loTriFactor));
876: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
878: /* Create the matrix description */
879: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
880: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
881: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
882: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
883: #else
884: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
885: #endif
886: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
887: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
889: /* set the operation */
890: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
892: /* set the matrix */
893: loTriFactor->csrMat = new CsrMatrix;
894: loTriFactor->csrMat->num_rows = A->rmap->n;
895: loTriFactor->csrMat->num_cols = A->cmap->n;
896: loTriFactor->csrMat->num_entries = a->nz;
898: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
899: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
901: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
902: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
904: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
905: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
907: /* Create the solve analysis information */
908: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
909: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
910: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
911: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
912: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
913: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
914: #endif
916: /* perform the solve analysis */
917: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
918: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
920: PetscCallCUDA(WaitForCUDA());
921: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
923: /* assign the pointer */
924: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
926: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
927: PetscCallCUDA(cudaFreeHost(AiUp));
928: PetscCallCUDA(cudaFreeHost(AjUp));
929: } else {
930: /* Fill the upper triangular matrix */
931: offset = 0;
932: for (i = 0; i < n; i++) {
933: /* set the pointers */
934: v = aa + ai[i];
935: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
937: /* first, set the diagonal elements */
938: AAUp[offset] = 1.0 / v[nz];
939: AALo[offset] = 1.0 / v[nz];
941: offset += 1;
942: if (nz > 0) {
943: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
944: for (j = offset; j < offset + nz; j++) {
945: AAUp[j] = -AAUp[j];
946: AALo[j] = AAUp[j] / v[nz];
947: }
948: offset += nz;
949: }
950: }
951: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
952: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
953: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
954: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
955: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
956: }
957: PetscCallCUDA(cudaFreeHost(AAUp));
958: PetscCallCUDA(cudaFreeHost(AALo));
959: } catch (char *ex) {
960: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
961: }
962: }
963: PetscFunctionReturn(PETSC_SUCCESS);
964: }
965: #endif
967: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
968: {
969: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
970: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
971: IS ip = a->row;
972: PetscBool perm_identity;
973: PetscInt n = A->rmap->n;
975: PetscFunctionBegin;
976: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
978: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
979: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
980: #else
981: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
982: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
983: #endif
984: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
986: A->offloadmask = PETSC_OFFLOAD_BOTH;
988: /* row and column permutation indices */
989: PetscCall(ISIdentity(ip, &perm_identity));
990: if (!perm_identity) {
991: IS iip;
992: const PetscInt *irip, *rip;
994: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
995: PetscCall(ISGetIndices(iip, &irip));
996: PetscCall(ISGetIndices(ip, &rip));
997: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
998: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
999: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1000: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1001: PetscCall(ISRestoreIndices(iip, &irip));
1002: PetscCall(ISDestroy(&iip));
1003: PetscCall(ISRestoreIndices(ip, &rip));
1004: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1005: }
1006: PetscFunctionReturn(PETSC_SUCCESS);
1007: }
1009: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1010: {
1011: PetscFunctionBegin;
1012: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1013: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1014: B->offloadmask = PETSC_OFFLOAD_CPU;
1016: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1017: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1018: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1019: #else
1020: /* determine which version of MatSolve needs to be used. */
1021: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1022: IS ip = b->row;
1023: PetscBool perm_identity;
1025: PetscCall(ISIdentity(ip, &perm_identity));
1026: if (perm_identity) {
1027: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1028: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1029: } else {
1030: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1031: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1032: }
1033: #endif
1034: B->ops->matsolve = NULL;
1035: B->ops->matsolvetranspose = NULL;
1037: /* get the triangular factors */
1038: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1039: PetscFunctionReturn(PETSC_SUCCESS);
1040: }
1042: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1043: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1044: {
1045: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1046: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1047: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1048: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1049: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1050: cusparseIndexBase_t indexBase;
1051: cusparseMatrixType_t matrixType;
1052: cusparseFillMode_t fillMode;
1053: cusparseDiagType_t diagType;
1055: PetscFunctionBegin;
1056: /* allocate space for the transpose of the lower triangular factor */
1057: PetscCall(PetscNew(&loTriFactorT));
1058: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1060: /* set the matrix descriptors of the lower triangular factor */
1061: matrixType = cusparseGetMatType(loTriFactor->descr);
1062: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1063: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1064: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1066: /* Create the matrix description */
1067: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1068: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1069: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1070: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1071: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1073: /* set the operation */
1074: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1076: /* allocate GPU space for the CSC of the lower triangular factor*/
1077: loTriFactorT->csrMat = new CsrMatrix;
1078: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1079: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1080: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1081: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1082: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1083: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1085: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1086: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1087: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1088: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1089: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1090: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1091: #endif
1093: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1094: {
1095: // there is no clean way to wrap this function with PetscCallCUSPARSE...
1096: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1097: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1098: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1099: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1100: #else
1101: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1102: #endif
1103: PetscCallCUSPARSE(stat);
1104: }
1106: PetscCallCUDA(WaitForCUDA());
1107: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1109: /* Create the solve analysis information */
1110: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1111: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1112: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1113: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1114: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1115: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1116: #endif
1118: /* perform the solve analysis */
1119: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1120: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1122: PetscCallCUDA(WaitForCUDA());
1123: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1125: /* assign the pointer */
1126: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1128: /*********************************************/
1129: /* Now the Transpose of the Upper Tri Factor */
1130: /*********************************************/
1132: /* allocate space for the transpose of the upper triangular factor */
1133: PetscCall(PetscNew(&upTriFactorT));
1134: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1136: /* set the matrix descriptors of the upper triangular factor */
1137: matrixType = cusparseGetMatType(upTriFactor->descr);
1138: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1139: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1140: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1142: /* Create the matrix description */
1143: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1144: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1145: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1146: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1147: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1149: /* set the operation */
1150: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1152: /* allocate GPU space for the CSC of the upper triangular factor*/
1153: upTriFactorT->csrMat = new CsrMatrix;
1154: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1155: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1156: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1157: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1158: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1159: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1161: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1162: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1163: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1164: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1165: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1166: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1167: #endif
1169: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1170: {
1171: // there is no clean way to wrap this function with PetscCallCUSPARSE...
1172: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1173: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1174: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1175: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1176: #else
1177: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1178: #endif
1179: PetscCallCUSPARSE(stat);
1180: }
1182: PetscCallCUDA(WaitForCUDA());
1183: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1185: /* Create the solve analysis information */
1186: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1187: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1188: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1189: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1190: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1191: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1192: #endif
1194: /* perform the solve analysis */
1195: /* TODO: this repeated descriptor setup and solve analysis should be factored into a helper function */
1196: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1197: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1199: PetscCallCUDA(WaitForCUDA());
1200: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1202: /* assign the pointer */
1203: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1204: PetscFunctionReturn(PETSC_SUCCESS);
1205: }
1206: #endif
1208: struct PetscScalarToPetscInt {
1209: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1210: };
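/* Converts a PetscScalar that holds an integer position back to a PetscInt; used below in
   MatSeqAIJCUSPARSEFormExplicitTranspose() to build the cached CSR-to-CSC permutation csr2csc_i. */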
1212: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1213: {
1214: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1215: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1216: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1217: cusparseStatus_t stat;
1218: cusparseIndexBase_t indexBase;
1220: PetscFunctionBegin;
1221: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1222: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1223: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1224: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1225: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1226: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1227: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1228: PetscCall(PetscLogGpuTimeBegin());
1229: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1230: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1232: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1233: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1234: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1235: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1237: /* set alpha and beta */
1238: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1239: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1240: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1241: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1242: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1243: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1245: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246: CsrMatrix *matrixT = new CsrMatrix;
1247: matstructT->mat = matrixT;
1248: matrixT->num_rows = A->cmap->n;
1249: matrixT->num_cols = A->rmap->n;
1250: matrixT->num_entries = a->nz;
1251: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1252: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253: matrixT->values = new THRUSTARRAY(a->nz);
1255: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1256: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1258: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1259: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1260: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1261: indexBase, cusparse_scalartype);
1262: PetscCallCUSPARSE(stat);
1263: #else
1264: /* cusparse-11.x returns errors for zero-sized matrices until 11.2.1,
1265: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1267: It is unclear what a proper value for matstructT->matDescr should be with empty matrices, so we set
1268: it to NULL so that anything relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1269: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we also set it accordingly.
1270: */
1271: if (matrixT->num_entries) {
1272: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1273: PetscCallCUSPARSE(stat);
1275: } else {
1276: matstructT->matDescr = NULL;
1277: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1278: }
1279: #endif
1280: #endif
1281: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1282: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1283: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1284: #else
1285: CsrMatrix *temp = new CsrMatrix;
1286: CsrMatrix *tempT = new CsrMatrix;
1287: /* First convert HYB to CSR */
1288: temp->num_rows = A->rmap->n;
1289: temp->num_cols = A->cmap->n;
1290: temp->num_entries = a->nz;
1291: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1292: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1293: temp->values = new THRUSTARRAY(a->nz);
1295: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1296: PetscCallCUSPARSE(stat);
1298: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1299: tempT->num_rows = A->rmap->n;
1300: tempT->num_cols = A->cmap->n;
1301: tempT->num_entries = a->nz;
1302: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1303: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1304: tempT->values = new THRUSTARRAY(a->nz);
1306: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1307: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1308: PetscCallCUSPARSE(stat);
1310: /* Last, convert CSC to HYB */
1311: cusparseHybMat_t hybMat;
1312: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1313: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1314: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1315: PetscCallCUSPARSE(stat);
1317: /* assign the pointer */
1318: matstructT->mat = hybMat;
1319: A->transupdated = PETSC_TRUE;
1320: /* delete temporaries */
1321: if (tempT) {
1322: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1323: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1324: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1325: delete (CsrMatrix *)tempT;
1326: }
1327: if (temp) {
1328: if (temp->values) delete (THRUSTARRAY *)temp->values;
1329: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1330: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1331: delete (CsrMatrix *)temp;
1332: }
1333: #endif
1334: }
1335: }
1336: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1337: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1338: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1339: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1340: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1341: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1342: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1343: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1344: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1345: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1346: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1347: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1348: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1349: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1350: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1351: }
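/* Build (once) the integer permutation csr2csc_i: fill a scalar array with 0, 1, ..., nnz-1, run csr2csc on it,
   and convert the permuted values back to integers, so that csr2csc_i[k] gives the position in the CSR value
   array of the k-th entry of the CSC (transpose) value array. Subsequent transpose updates then reduce to the
   single gather below (thrust::copy with a permutation iterator), avoiding repeated csr2csc calls when only the
   numerical values change. */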
1352: if (!cusparsestruct->csr2csc_i) {
1353: THRUSTARRAY csr2csc_a(matrix->num_entries);
1354: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1356: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1357: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1358: void *csr2cscBuffer;
1359: size_t csr2cscBufferSize;
1360: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1361: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1362: PetscCallCUSPARSE(stat);
1363: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1364: #endif
1366: if (matrix->num_entries) {
1367: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1368: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1369: Every parameter was checked and looked correct; it is unclear why cusparse complains.
1371: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1372: should be filled with indexBase, so we take that shortcut here instead of calling csr2csc.
1373: */
1374: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1375: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1376: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1377: PetscCallCUSPARSE(stat);
1378: #else
1379: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1380: PetscCallCUSPARSE(stat);
1381: #endif
1382: } else {
1383: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1384: }
1386: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1387: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1388: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1389: PetscCallCUDA(cudaFree(csr2cscBuffer));
1390: #endif
1391: }
1392: PetscCallThrust(
1393: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1394: }
1395: PetscCall(PetscLogGpuTimeEnd());
1396: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1397: /* the compressed row indices are not used for matTranspose */
1398: matstructT->cprowIndices = NULL;
1399: /* assign the pointer */
1400: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1401: A->transupdated = PETSC_TRUE;
1402: PetscFunctionReturn(PETSC_SUCCESS);
1403: }
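/* The explicit transpose is cached in cusparsestruct->matTranspose and guarded by A->transupdated; it is only
   rebuilt (or has its values refreshed through the cached csr2csc_i permutation) after the cache is invalidated,
   e.g. by MatSeqAIJCUSPARSEInvalidateTranspose(). */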
1405: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1406: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1407: {
1408: const PetscScalar *barray;
1409: PetscScalar *xarray;
1410: thrust::device_ptr<const PetscScalar> bGPU;
1411: thrust::device_ptr<PetscScalar> xGPU;
1412: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1413: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1414: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1415: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1416: PetscInt m = A->rmap->n;
1418: PetscFunctionBegin;
1419: PetscCall(PetscLogGpuTimeBegin());
1420: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1421: PetscCall(VecCUDAGetArrayRead(b, &barray));
1422: xGPU = thrust::device_pointer_cast(xarray);
1423: bGPU = thrust::device_pointer_cast(barray);
1425: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1426: if (fs->rpermIndices) {
1427: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1428: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1429: } else {
1430: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1431: }
1433: // Solve L Y = X
1434: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1435: // Note that cusparseSpSV_solve() implicitly reuses the external buffer passed to cusparseSpSV_analysis()
1436: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1438: // Solve U X = Y
1439: if (fs->cpermIndices) {
1440: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1441: } else {
1442: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1443: }
1444: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1446: // Reorder X with the column permutation if needed, and put the result back to x
1447: if (fs->cpermIndices) {
1448: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1449: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1450: }
1451: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1452: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1453: PetscCall(PetscLogGpuTimeEnd());
1454: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1455: PetscFunctionReturn(PETSC_SUCCESS);
1456: }
1458: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1459: {
1460: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1461: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1462: const PetscScalar *barray;
1463: PetscScalar *xarray;
1464: thrust::device_ptr<const PetscScalar> bGPU;
1465: thrust::device_ptr<PetscScalar> xGPU;
1466: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1467: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1468: PetscInt m = A->rmap->n;
1470: PetscFunctionBegin;
1471: PetscCall(PetscLogGpuTimeBegin());
1472: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1473: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1474: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1475: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1477: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1478: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1479: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1480: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1481: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1482: }
1484: if (!fs->updatedTransposeSpSVAnalysis) {
1485: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1487: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1488: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1489: }
1491: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1492: PetscCall(VecCUDAGetArrayRead(b, &barray));
1493: xGPU = thrust::device_pointer_cast(xarray);
1494: bGPU = thrust::device_pointer_cast(barray);
1496: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1497: if (fs->rpermIndices) {
1498: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1499: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1500: } else {
1501: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1502: }
1504: // Solve Ut Y = X
1505: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1506: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1508: // Solve Lt X = Y
1509: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1510: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1511: } else {
1512: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1513: }
1514: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1516: // Reorder X with the column permutation if needed, and put the result back to x
1517: if (fs->cpermIndices) {
1518: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1519: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1520: }
1522: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1523: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1524: PetscCall(PetscLogGpuTimeEnd());
1525: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1526: PetscFunctionReturn(PETSC_SUCCESS);
1527: }
1528: #else
1529: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1530: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1531: {
1532: PetscInt n = xx->map->n;
1533: const PetscScalar *barray;
1534: PetscScalar *xarray;
1535: thrust::device_ptr<const PetscScalar> bGPU;
1536: thrust::device_ptr<PetscScalar> xGPU;
1537: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1538: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1539: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1540: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1542: PetscFunctionBegin;
1543: /* Analyze the matrix and create the transpose ... on the fly */
1544: if (!loTriFactorT && !upTriFactorT) {
1545: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1546: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1547: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1548: }
1550: /* Get the GPU pointers */
1551: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1552: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1553: xGPU = thrust::device_pointer_cast(xarray);
1554: bGPU = thrust::device_pointer_cast(barray);
1556: PetscCall(PetscLogGpuTimeBegin());
1557: /* First, reorder with the row permutation */
1558: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1560: /* Next, solve U */
1561: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1562: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1564: /* Then, solve L */
1565: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1566: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1568: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1569: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1571: /* Copy the temporary to the full solution. */
1572: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1574: /* restore */
1575: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1576: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1577: PetscCall(PetscLogGpuTimeEnd());
1578: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1579: PetscFunctionReturn(PETSC_SUCCESS);
1580: }
1582: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1583: {
1584: const PetscScalar *barray;
1585: PetscScalar *xarray;
1586: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1587: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1588: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1589: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1591: PetscFunctionBegin;
1592: /* Analyze the matrix and create the transpose ... on the fly */
1593: if (!loTriFactorT && !upTriFactorT) {
1594: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1595: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1596: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1597: }
1599: /* Get the GPU pointers */
1600: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1601: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1603: PetscCall(PetscLogGpuTimeBegin());
1604: /* First, solve U */
1605: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1606: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1608: /* Then, solve L */
1609: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1610: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1612: /* restore */
1613: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1614: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1615: PetscCall(PetscLogGpuTimeEnd());
1616: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1617: PetscFunctionReturn(PETSC_SUCCESS);
1618: }
1620: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1621: {
1622: const PetscScalar *barray;
1623: PetscScalar *xarray;
1624: thrust::device_ptr<const PetscScalar> bGPU;
1625: thrust::device_ptr<PetscScalar> xGPU;
1626: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1627: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1628: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1629: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1631: PetscFunctionBegin;
1632: /* Get the GPU pointers */
1633: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1634: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1635: xGPU = thrust::device_pointer_cast(xarray);
1636: bGPU = thrust::device_pointer_cast(barray);
1638: PetscCall(PetscLogGpuTimeBegin());
1639: /* First, reorder with the row permutation */
1640: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1642: /* Next, solve L */
1643: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1644: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1646: /* Then, solve U */
1647: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1648: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1650: /* Last, reorder with the column permutation */
1651: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1653: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1654: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1655: PetscCall(PetscLogGpuTimeEnd());
1656: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1657: PetscFunctionReturn(PETSC_SUCCESS);
1658: }
1660: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1661: {
1662: const PetscScalar *barray;
1663: PetscScalar *xarray;
1664: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1665: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1666: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1667: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1669: PetscFunctionBegin;
1670: /* Get the GPU pointers */
1671: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1672: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1674: PetscCall(PetscLogGpuTimeBegin());
1675: /* First, solve L */
1676: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1677: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1679: /* Next, solve U */
1680: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1681: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1683: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1684: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1685: PetscCall(PetscLogGpuTimeEnd());
1686: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1687: PetscFunctionReturn(PETSC_SUCCESS);
1688: }
1689: #endif
1691: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1692: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1693: {
1694: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1695: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1696: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1697: CsrMatrix *Acsr;
1698: PetscInt m, nz;
1699: PetscBool flg;
1701: PetscFunctionBegin;
1702: if (PetscDefined(USE_DEBUG)) {
1703: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1704: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1705: }
1707: /* Copy A's value to fact */
1708: m = fact->rmap->n;
1709: nz = aij->nz;
1710: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1711: Acsr = (CsrMatrix *)Acusp->mat->mat;
1712: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1714: PetscCall(PetscLogGpuTimeBegin());
1715: /* Factorize fact inplace */
1716: if (m)
1717: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1718: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1719: if (PetscDefined(USE_DEBUG)) {
1720: int numerical_zero;
1721: cusparseStatus_t status;
1722: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1723: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1724: }
1726: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1727: if (fs->updatedSpSVAnalysis) {
1728: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1729: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1730: } else
1731: #endif
1732: {
1733: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we call it after cusparseXcsrilu02().
1734: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1735: */
1736: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1738: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1740: fs->updatedSpSVAnalysis = PETSC_TRUE;
1741: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1742: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1743: }
1745: fact->offloadmask = PETSC_OFFLOAD_GPU;
1746: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1747: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1748: fact->ops->matsolve = NULL;
1749: fact->ops->matsolvetranspose = NULL;
1750: PetscCall(PetscLogGpuTimeEnd());
1751: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1752: PetscFunctionReturn(PETSC_SUCCESS);
1753: }
1755: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1756: {
1757: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1758: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1759: PetscInt m, nz;
1761: PetscFunctionBegin;
1762: if (PetscDefined(USE_DEBUG)) {
1763: PetscBool flg, diagDense;
1765: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1766: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1767: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1768: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
1769: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing a diagonal entry");
1770: }
1772: /* Free the old stale stuff */
1773: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1775: /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on the host;
1776: they will not be used, but allocating them makes debugging easier.
1777: */
1778: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1780: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1781: fact->factortype = MAT_FACTOR_ILU;
1782: fact->info.factor_mallocs = 0;
1783: fact->info.fill_ratio_given = info->fill;
1784: fact->info.fill_ratio_needed = 1.0;
1786: aij->row = NULL;
1787: aij->col = NULL;
1789: /* ====================================================================== */
1790: /* Copy A's i, j to fact and also allocate the value array of fact. */
1791: /* We'll do in-place factorization on fact */
1792: /* ====================================================================== */
1793: const int *Ai, *Aj;
1795: m = fact->rmap->n;
1796: nz = aij->nz;
1798: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1799: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1800: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1801: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1802: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1803: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1805: /* ====================================================================== */
1806: /* Create descriptors for M, L, U */
1807: /* ====================================================================== */
1808: cusparseFillMode_t fillMode;
1809: cusparseDiagType_t diagType;
1811: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1812: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1813: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1815: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1816: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1817: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1818: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1819: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1820: */
1821: fillMode = CUSPARSE_FILL_MODE_LOWER;
1822: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1823: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1824: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1825: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1827: fillMode = CUSPARSE_FILL_MODE_UPPER;
1828: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1829: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1830: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1831: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1833: /* ========================================================================= */
1834: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1835: /* ========================================================================= */
1836: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1837: if (m)
1838: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1839: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1841: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1842: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1844: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1845: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1847: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1848: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1850: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1851: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1853: /* From experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1854: and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1855: spsvBuffer_L and spsvBuffer_U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
1856: To save memory, we let factBuffer_M share storage with the bigger of spsvBuffer_L/U.
1857: */
1858: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1859: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1860: fs->spsvBuffer_L = fs->factBuffer_M;
1861: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1862: } else {
1863: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1864: fs->spsvBuffer_U = fs->factBuffer_M;
1865: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1866: }
1868: /* ========================================================================== */
1869: /* Perform analysis of ilu0 on M, SpSv on L and U */
1870: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1871: /* ========================================================================== */
1872: int structural_zero;
1873: cusparseStatus_t status;
1875: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1876: if (m)
1877: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1878: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1879: if (PetscDefined(USE_DEBUG)) {
1880: /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1881: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1882: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1883: }
1885: /* Estimate FLOPs of the numeric factorization */
1886: {
1887: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1888: PetscInt *Ai, nzRow, nzLeft;
1889: const PetscInt *adiag;
1890: PetscLogDouble flops = 0.0;
1892: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
1893: Ai = Aseq->i;
1894: for (PetscInt i = 0; i < m; i++) {
1895: if (Ai[i] < adiag[i] && adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1896: nzRow = Ai[i + 1] - Ai[i];
1897: nzLeft = adiag[i] - Ai[i];
1898: /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the
1899: nonzeros to the right of (and including) the eliminated entry, each update costing one multiplication and one addition.
1900: */
1901: nzLeft = (nzRow - 1) / 2;
1902: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1903: }
1904: }
1905: fs->numericFactFlops = flops;
1906: }
1907: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1908: PetscFunctionReturn(PETSC_SUCCESS);
1909: }
1911: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1912: {
1913: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1914: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1915: const PetscScalar *barray;
1916: PetscScalar *xarray;
1918: PetscFunctionBegin;
1919: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1920: PetscCall(VecCUDAGetArrayRead(b, &barray));
1921: PetscCall(PetscLogGpuTimeBegin());
1923: /* Solve L*y = b */
1924: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1925: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1926: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1927: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1929: /* Solve Lt*x = y */
1930: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1931: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1932: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1934: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1935: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1937: PetscCall(PetscLogGpuTimeEnd());
1938: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1939: PetscFunctionReturn(PETSC_SUCCESS);
1940: }
1942: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1943: {
1944: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1945: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1946: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1947: CsrMatrix *Acsr;
1948: PetscInt m, nz;
1949: PetscBool flg;
1951: PetscFunctionBegin;
1952: if (PetscDefined(USE_DEBUG)) {
1953: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1954: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1955: }
1957: /* Copy A's value to fact */
1958: m = fact->rmap->n;
1959: nz = aij->nz;
1960: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1961: Acsr = (CsrMatrix *)Acusp->mat->mat;
1962: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1964: /* Factorize fact inplace */
1965: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1966: csric02() only takes the lower triangular part of matrix A to perform factorization.
1967: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1968: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1969: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1970: */
1971: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1972: if (PetscDefined(USE_DEBUG)) {
1973: int numerical_zero;
1974: cusparseStatus_t status;
1975: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1976: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1977: }
1979: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1980: if (fs->updatedSpSVAnalysis) {
1981: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1982: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1983: } else
1984: #endif
1985: {
1986: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1988: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1989: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1990: */
1991: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1992: fs->updatedSpSVAnalysis = PETSC_TRUE;
1993: }
1995: fact->offloadmask = PETSC_OFFLOAD_GPU;
1996: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1997: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1998: fact->ops->matsolve = NULL;
1999: fact->ops->matsolvetranspose = NULL;
2000: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2001: PetscFunctionReturn(PETSC_SUCCESS);
2002: }
2004: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2005: {
2006: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2007: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2008: PetscInt m, nz;
2010: PetscFunctionBegin;
2011: if (PetscDefined(USE_DEBUG)) {
2012: PetscBool flg, diagDense;
2014: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2015: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2016: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2017: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, NULL, &diagDense));
2018: PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entries");
2019: }
2021: /* Free the old stale stuff */
2022: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2024: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
2025: but they will not be used. We allocate them just to make debugging easier.
2026: */
2027: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2029: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2030: fact->factortype = MAT_FACTOR_ICC;
2031: fact->info.factor_mallocs = 0;
2032: fact->info.fill_ratio_given = info->fill;
2033: fact->info.fill_ratio_needed = 1.0;
2035: aij->row = NULL;
2036: aij->col = NULL;
2038: /* ====================================================================== */
2039: /* Copy A's i, j to fact and also allocate the value array of fact. */
2040: /* We'll do in-place factorization on fact */
2041: /* ====================================================================== */
2042: const int *Ai, *Aj;
2044: m = fact->rmap->n;
2045: nz = aij->nz;
2047: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2048: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2049: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2050: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2051: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2052: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2054: /* ====================================================================== */
2055: /* Create mat descriptors for M, L */
2056: /* ====================================================================== */
2057: cusparseFillMode_t fillMode;
2058: cusparseDiagType_t diagType;
2060: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2061: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2062: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2064: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2065: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2066: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2067: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2068: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2069: */
2070: fillMode = CUSPARSE_FILL_MODE_LOWER;
2071: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2072: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2073: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2074: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2076: /* ========================================================================= */
2077: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2078: /* ========================================================================= */
2079: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2080: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2082: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2083: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2085: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2086: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2088: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2089: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2091: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2092: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2094: /* To save device memory, we have the factorization buffer share storage with one of the solver buffers.
2095: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2096: */
2097: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2098: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2099: fs->spsvBuffer_L = fs->factBuffer_M;
2100: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2101: } else {
2102: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2103: fs->spsvBuffer_Lt = fs->factBuffer_M;
2104: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2105: }
2107: /* ========================================================================== */
2108: /* Perform analysis of ic0 on M */
2109: /* The lower triangular part of M has the same sparsity pattern as L */
2110: /* ========================================================================== */
2111: int structural_zero;
2112: cusparseStatus_t status;
2114: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2115: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2116: if (PetscDefined(USE_DEBUG)) {
2117: /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2118: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2119: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2120: }
2122: /* Estimate FLOPs of the numeric factorization */
2123: {
2124: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2125: PetscInt *Ai, nzRow, nzLeft;
2126: PetscLogDouble flops = 0.0;
2128: Ai = Aseq->i;
2129: for (PetscInt i = 0; i < m; i++) {
2130: nzRow = Ai[i + 1] - Ai[i];
2131: if (nzRow > 1) {
2132: /* We eliminate the nonzeros to the left of the diagonal one by one. Assume that each elimination updates
2133: the nonzeros to its right (including the eliminated one itself), each update incurring a multiplication and an addition.
2134: */
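/* A sketch of where the flop formula below comes from (a derivation added for clarity, not in the original
   source): eliminating the k-th of the nzLeft nonzeros left of the diagonal (k = 1..nzLeft) updates the
   nzRow - k + 1 entries from that position to the end of the row, at 2 flops each; summing over k gives
   sum_{k=1}^{nzLeft} 2*(nzRow - k + 1) = nzLeft*(2*nzRow - nzLeft + 1). */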
2135: nzLeft = (nzRow - 1) / 2;
2136: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2137: }
2138: }
2139: fs->numericFactFlops = flops;
2140: }
2141: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2142: PetscFunctionReturn(PETSC_SUCCESS);
2143: }
2144: #endif
2146: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2147: {
2148: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2149: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2151: PetscFunctionBegin;
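/* The numeric LU factorization itself is performed on the host by the SeqAIJ kernel; only the subsequent
   triangular solves (and their analysis) are offloaded to the GPU, unless use_cpu_solve is set
   (a summarizing comment, not in the original source). */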
2152: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2153: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2154: B->offloadmask = PETSC_OFFLOAD_CPU;
2156: if (!cusparsestruct->use_cpu_solve) {
2157: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2158: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2159: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2160: #else
2161: /* determine which version of MatSolve needs to be used. */
2162: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2163: IS isrow = b->row, iscol = b->col;
2164: PetscBool row_identity, col_identity;
2166: PetscCall(ISIdentity(isrow, &row_identity));
2167: PetscCall(ISIdentity(iscol, &col_identity));
2168: if (row_identity && col_identity) {
2169: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2170: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2171: } else {
2172: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2173: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2174: }
2175: #endif
2176: }
2177: B->ops->matsolve = NULL;
2178: B->ops->matsolvetranspose = NULL;
2180: /* get the triangular factors */
2181: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2182: PetscFunctionReturn(PETSC_SUCCESS);
2183: }
2185: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2186: {
2187: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2189: PetscFunctionBegin;
2190: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2191: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2192: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2193: PetscFunctionReturn(PETSC_SUCCESS);
2194: }
2196: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2197: {
2198: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2200: PetscFunctionBegin;
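/* Dispatch (a summarizing comment, not in the original source): with zero fill levels, identity row/column
   orderings, and factorization not forced onto the host, use the dedicated device ILU(0) path available with
   CUDA >= 11.4; otherwise fall back to the host symbolic factorization and the generic numeric/solve path. */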
2201: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2202: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2203: if (!info->factoronhost) {
2204: PetscCall(ISIdentity(isrow, &row_identity));
2205: PetscCall(ISIdentity(iscol, &col_identity));
2206: }
2207: if (!info->levels && row_identity && col_identity) {
2208: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2209: } else
2210: #endif
2211: {
2212: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2213: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2214: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2215: }
2216: PetscFunctionReturn(PETSC_SUCCESS);
2217: }
2219: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2220: {
2221: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2223: PetscFunctionBegin;
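/* Same dispatch as for ILU above: ICC(0) with an identity permutation takes the device fast path, everything
   else goes through the host symbolic factorization (a summarizing comment, not in the original source). */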
2224: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2225: PetscBool perm_identity = PETSC_FALSE;
2226: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2227: if (!info->levels && perm_identity) {
2228: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2229: } else
2230: #endif
2231: {
2232: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2233: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2234: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2235: }
2236: PetscFunctionReturn(PETSC_SUCCESS);
2237: }
2239: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2240: {
2241: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2243: PetscFunctionBegin;
2244: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2245: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2246: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2247: PetscFunctionReturn(PETSC_SUCCESS);
2248: }
2250: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2251: {
2252: PetscFunctionBegin;
2253: *type = MATSOLVERCUSPARSE;
2254: PetscFunctionReturn(PETSC_SUCCESS);
2255: }
2257: /*MC
2258: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2259: of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
2260: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2261: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2262: CuSPARSE triangular solve algorithm. However, the performance can be quite poor, and thus these
2263: algorithms are not recommended. This class does NOT support direct solver operations.
2265: Level: beginner
2267: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2268: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2269: M*/
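/* A minimal usage sketch (an illustrative example added here, not part of the manual page above): to use this
   solver for an ILU-preconditioned solve, either set it programmatically,
     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));
   or pass the options
     -pc_type ilu -pc_factor_mat_solver_type cusparse
   on the command line, with the system matrix created as MATSEQAIJCUSPARSE. */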
2271: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2272: {
2273: PetscInt n = A->rmap->n;
2275: PetscFunctionBegin;
2276: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2277: PetscCall(MatSetSizes(*B, n, n, n, n));
2278: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2279: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2281: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2282: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2283: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2284: if (!A->boundtocpu) {
2285: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2286: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2287: } else {
2288: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2289: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2290: }
2291: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2292: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2293: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2294: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2295: if (!A->boundtocpu) {
2296: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2297: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2298: } else {
2299: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2300: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2301: }
2302: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2303: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2304: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2306: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2307: (*B)->canuseordering = PETSC_TRUE;
2308: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2309: PetscFunctionReturn(PETSC_SUCCESS);
2310: }
2312: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2313: {
2314: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2315: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2316: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2317: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2318: #endif
2320: PetscFunctionBegin;
2321: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2322: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2323: if (A->factortype == MAT_FACTOR_NONE) {
2324: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2325: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2326: }
2327: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2328: else if (fs->csrVal) {
2329: /* We have a factorized matrix on device and are able to copy it to host */
2330: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2331: }
2332: #endif
2333: else
2334: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2335: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2336: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2337: A->offloadmask = PETSC_OFFLOAD_BOTH;
2338: }
2339: PetscFunctionReturn(PETSC_SUCCESS);
2340: }
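/* The Get/Restore array callbacks below differ only in their offload handling (a summarizing comment, not in
   the original source): GetArray and GetArrayRead first sync values from the GPU so the host array is current;
   GetArrayWrite skips that copy because the caller will overwrite the values; RestoreArray and RestoreArrayWrite
   mark the matrix PETSC_OFFLOAD_CPU since the host copy may now be newer; RestoreArrayRead leaves the offload
   mask untouched because read access cannot modify the values. */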
2342: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2343: {
2344: PetscFunctionBegin;
2345: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2346: *array = ((Mat_SeqAIJ *)A->data)->a;
2347: PetscFunctionReturn(PETSC_SUCCESS);
2348: }
2350: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2351: {
2352: PetscFunctionBegin;
2353: A->offloadmask = PETSC_OFFLOAD_CPU;
2354: *array = NULL;
2355: PetscFunctionReturn(PETSC_SUCCESS);
2356: }
2358: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2359: {
2360: PetscFunctionBegin;
2361: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2362: *array = ((Mat_SeqAIJ *)A->data)->a;
2363: PetscFunctionReturn(PETSC_SUCCESS);
2364: }
2366: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2367: {
2368: PetscFunctionBegin;
2369: *array = NULL;
2370: PetscFunctionReturn(PETSC_SUCCESS);
2371: }
2373: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2374: {
2375: PetscFunctionBegin;
2376: *array = ((Mat_SeqAIJ *)A->data)->a;
2377: PetscFunctionReturn(PETSC_SUCCESS);
2378: }
2380: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2381: {
2382: PetscFunctionBegin;
2383: A->offloadmask = PETSC_OFFLOAD_CPU;
2384: *array = NULL;
2385: PetscFunctionReturn(PETSC_SUCCESS);
2386: }
2388: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2389: {
2390: Mat_SeqAIJCUSPARSE *cusp;
2391: CsrMatrix *matrix;
2393: PetscFunctionBegin;
2394: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2395: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2396: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2397: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2398: matrix = (CsrMatrix *)cusp->mat->mat;
2400: if (i) {
2401: #if !defined(PETSC_USE_64BIT_INDICES)
2402: *i = matrix->row_offsets->data().get();
2403: #else
2404: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2405: #endif
2406: }
2407: if (j) {
2408: #if !defined(PETSC_USE_64BIT_INDICES)
2409: *j = matrix->column_indices->data().get();
2410: #else
2411: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2412: #endif
2413: }
2414: if (a) *a = matrix->values->data().get();
2415: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2416: PetscFunctionReturn(PETSC_SUCCESS);
2417: }
2419: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2420: {
2421: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2422: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2423: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2424: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2425: cusparseStatus_t stat;
2426: PetscBool both = PETSC_TRUE;
2428: PetscFunctionBegin;
2429: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2430: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2431: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2432: CsrMatrix *matrix;
2433: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2435: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2436: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2437: matrix->values->assign(a->a, a->a + a->nz);
2438: PetscCallCUDA(WaitForCUDA());
2439: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2440: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2441: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2442: } else {
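/* Nonzero pattern changed, first offload, or a non-CSR storage format requested: rebuild the cusparse
   structures from scratch (a clarifying comment, not in the original source) */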
2443: PetscInt nnz;
2444: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2445: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2446: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2447: delete cusparsestruct->workVector;
2448: delete cusparsestruct->rowoffsets_gpu;
2449: cusparsestruct->workVector = NULL;
2450: cusparsestruct->rowoffsets_gpu = NULL;
2451: try {
2452: if (a->compressedrow.use) {
2453: m = a->compressedrow.nrows;
2454: ii = a->compressedrow.i;
2455: ridx = a->compressedrow.rindex;
2456: } else {
2457: m = A->rmap->n;
2458: ii = a->i;
2459: ridx = NULL;
2460: }
2461: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2462: if (!a->a) {
2463: nnz = ii[m];
2464: both = PETSC_FALSE;
2465: } else nnz = a->nz;
2466: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2468: /* create cusparse matrix */
2469: cusparsestruct->nrows = m;
2470: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2471: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2472: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2473: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2475: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2476: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2477: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2478: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2479: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2480: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2481: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2483: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2484: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2485: /* set the matrix */
2486: CsrMatrix *mat = new CsrMatrix;
2487: mat->num_rows = m;
2488: mat->num_cols = A->cmap->n;
2489: mat->num_entries = nnz;
2490: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2491: mat->row_offsets->assign(ii, ii + m + 1);
2492: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2493: mat->column_indices->assign(a->j, a->j + nnz);
2495: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2496: if (a->a) mat->values->assign(a->a, a->a + nnz);
2498: /* assign the pointer */
2499: matstruct->mat = mat;
2500: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2501: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2502: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2503: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2504: PetscCallCUSPARSE(stat);
2505: }
2506: #endif
2507: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2508: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2509: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2510: #else
2511: CsrMatrix *mat = new CsrMatrix;
2512: mat->num_rows = m;
2513: mat->num_cols = A->cmap->n;
2514: mat->num_entries = nnz;
2515: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2516: mat->row_offsets->assign(ii, ii + m + 1);
2518: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2519: mat->column_indices->assign(a->j, a->j + nnz);
2521: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2522: if (a->a) mat->values->assign(a->a, a->a + nnz);
2524: cusparseHybMat_t hybMat;
2525: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2526: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2527: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2528: PetscCallCUSPARSE(stat);
2529: /* assign the pointer */
2530: matstruct->mat = hybMat;
2532: if (mat) {
2533: if (mat->values) delete (THRUSTARRAY *)mat->values;
2534: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2535: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2536: delete (CsrMatrix *)mat;
2537: }
2538: #endif
2539: }
2541: /* assign the compressed row indices */
2542: if (a->compressedrow.use) {
2543: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2544: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2545: matstruct->cprowIndices->assign(ridx, ridx + m);
2546: tmp = m;
2547: } else {
2548: cusparsestruct->workVector = NULL;
2549: matstruct->cprowIndices = NULL;
2550: tmp = 0;
2551: }
2552: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2554: /* assign the pointer */
2555: cusparsestruct->mat = matstruct;
2556: } catch (char *ex) {
2557: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2558: }
2559: PetscCallCUDA(WaitForCUDA());
2560: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2561: cusparsestruct->nonzerostate = A->nonzerostate;
2562: }
2563: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2564: }
2565: PetscFunctionReturn(PETSC_SUCCESS);
2566: }
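/* Thrust functors applied to the tuples produced by zip iterators (a descriptive note, not in the original
   source): VecCUDAPlusEquals accumulates the first tuple element into the second, VecCUDAEquals copies the
   first into the second, and VecCUDAEqualsReverse copies the second into the first. They are used with
   thrust::for_each elsewhere in this file, e.g. to scatter or accumulate compressed-row work-vector entries
   into full-length vectors. */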
2568: struct VecCUDAPlusEquals {
2569: template <typename Tuple>
2570: __host__ __device__ void operator()(Tuple t)
2571: {
2572: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2573: }
2574: };
2576: struct VecCUDAEquals {
2577: template <typename Tuple>
2578: __host__ __device__ void operator()(Tuple t)
2579: {
2580: thrust::get<1>(t) = thrust::get<0>(t);
2581: }
2582: };
2584: struct VecCUDAEqualsReverse {
2585: template <typename Tuple>
2586: __host__ __device__ void operator()(Tuple t)
2587: {
2588: thrust::get<0>(t) = thrust::get<1>(t);
2589: }
2590: };
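/* Context stored in C->product->data for the cusparse-based matrix products below (a summarizing comment, not
   in the original source): Bt holds an explicit dense B^T when the legacy csrmm API cannot transpose B, X is
   the intermediate dense result used for PtAP/RARt, Bcsr wraps a compressed-row B with full row offsets, and
   the remaining descriptors and buffers cache cusparse SpMM/SpGEMM state between numeric calls. */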
2592: struct MatProductCtx_MatMatCusparse {
2593: PetscBool cisdense;
2594: PetscScalar *Bt;
2595: Mat X;
2596: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2597: PetscLogDouble flops;
2598: CsrMatrix *Bcsr;
2600: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2601: cusparseSpMatDescr_t matSpBDescr;
2602: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2603: cusparseDnMatDescr_t matBDescr;
2604: cusparseDnMatDescr_t matCDescr;
2605: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2606: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2607: void *dBuffer4;
2608: void *dBuffer5;
2609: #endif
2610: size_t mmBufferSize;
2611: void *mmBuffer;
2612: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2613: cusparseSpGEMMDescr_t spgemmDesc;
2614: #endif
2615: };
2617: static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data)
2618: {
2619: MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data;
2621: PetscFunctionBegin;
2622: PetscCallCUDA(cudaFree(mmdata->Bt));
2623: delete mmdata->Bcsr;
2624: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2625: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2626: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2627: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2628: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2629: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2630: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2631: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2632: #endif
2633: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2634: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2635: #endif
2636: PetscCall(MatDestroy(&mmdata->X));
2637: PetscCall(PetscFree(*data));
2638: PetscFunctionReturn(PETSC_SUCCESS);
2639: }
2641: #include <../src/mat/impls/dense/seq/dense.h>
2643: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2644: {
2645: Mat_Product *product = C->product;
2646: Mat A, B;
2647: PetscInt m, n, blda, clda;
2648: PetscBool flg, biscuda;
2649: Mat_SeqAIJCUSPARSE *cusp;
2650: cusparseStatus_t stat;
2651: cusparseOperation_t opA;
2652: const PetscScalar *barray;
2653: PetscScalar *carray;
2654: MatProductCtx_MatMatCusparse *mmdata;
2655: Mat_SeqAIJCUSPARSEMultStruct *mat;
2656: CsrMatrix *csrmat;
2658: PetscFunctionBegin;
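/* Computes the dense result C (or the intermediate X for PtAP/RARt) as op(A)*op(B) with cusparse SpMM; for
   PtAP and RARt the remaining dense-dense product with B (or B^T) is then performed on the GPU
   (a summarizing comment, not in the original source). */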
2659: MatCheckProduct(C, 1);
2660: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2661: mmdata = (MatProductCtx_MatMatCusparse *)product->data;
2662: A = product->A;
2663: B = product->B;
2664: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2665: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2666: /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2667: Instead of silently accepting a wrong answer, I prefer to raise an error */
2668: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2669: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2670: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2671: switch (product->type) {
2672: case MATPRODUCT_AB:
2673: case MATPRODUCT_PtAP:
2674: mat = cusp->mat;
2675: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2676: m = A->rmap->n;
2677: n = B->cmap->n;
2678: break;
2679: case MATPRODUCT_AtB:
2680: if (!A->form_explicit_transpose) {
2681: mat = cusp->mat;
2682: opA = CUSPARSE_OPERATION_TRANSPOSE;
2683: } else {
2684: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2685: mat = cusp->matTranspose;
2686: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2687: }
2688: m = A->cmap->n;
2689: n = B->cmap->n;
2690: break;
2691: case MATPRODUCT_ABt:
2692: case MATPRODUCT_RARt:
2693: mat = cusp->mat;
2694: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2695: m = A->rmap->n;
2696: n = B->rmap->n;
2697: break;
2698: default:
2699: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2700: }
2701: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2702: csrmat = (CsrMatrix *)mat->mat;
2703: /* if the user passed a CPU matrix, copy the data to the GPU */
2704: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2705: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2706: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2708: PetscCall(MatDenseGetLDA(B, &blda));
2709: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2710: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2711: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2712: } else {
2713: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2714: PetscCall(MatDenseGetLDA(C, &clda));
2715: }
2717: PetscCall(PetscLogGpuTimeBegin());
2718: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2719: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2720: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2721: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2722: #else
2723: cusparseSpMatDescr_t &matADescr = mat->matDescr;
2724: #endif
2726: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2727: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2728: size_t mmBufferSize;
2729: if (mmdata->initialized && mmdata->Blda != blda) {
2730: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2731: mmdata->matBDescr = NULL;
2732: }
2733: if (!mmdata->matBDescr) {
2734: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2735: mmdata->Blda = blda;
2736: }
2738: if (mmdata->initialized && mmdata->Clda != clda) {
2739: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2740: mmdata->matCDescr = NULL;
2741: }
2742: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2743: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2744: mmdata->Clda = clda;
2745: }
2747: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2748: if (matADescr) {
2749: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found matADescr could not be reused; it could be a cusparse bug
2750: matADescr = NULL;
2751: }
2752: #endif
2754: if (!matADescr) {
2755: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2756: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2757: PetscCallCUSPARSE(stat);
2758: }
2760: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2762: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2763: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2764: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2765: mmdata->mmBufferSize = mmBufferSize;
2766: }
2768: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2769: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2770: #endif
2772: mmdata->initialized = PETSC_TRUE;
2773: } else {
2774: /* to be safe, always update pointers of the mats */
2775: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2776: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2777: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2778: }
2780: /* do cusparseSpMM, which supports transpose on B */
2781: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2782: #else
2783: PetscInt k;
2784: /* cusparseXcsrmm does not support transpose on B */
2785: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2786: cublasHandle_t cublasv2handle;
2787: cublasStatus_t cerr;
2789: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2790: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2791: PetscCallCUBLAS(cerr);
2792: blda = B->cmap->n;
2793: k = B->cmap->n;
2794: } else {
2795: k = B->rmap->n;
2796: }
2798: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2799: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2800: PetscCallCUSPARSE(stat);
2801: #endif
2802: PetscCall(PetscLogGpuTimeEnd());
2803: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2804: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2805: if (product->type == MATPRODUCT_RARt) {
2806: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2807: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2808: } else if (product->type == MATPRODUCT_PtAP) {
2809: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2810: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2811: } else {
2812: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2813: }
2814: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2815: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2816: PetscFunctionReturn(PETSC_SUCCESS);
2817: }
2819: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2820: {
2821: Mat_Product *product = C->product;
2822: Mat A, B;
2823: PetscInt m, n;
2824: PetscBool cisdense, flg;
2825: MatProductCtx_MatMatCusparse *mmdata;
2826: Mat_SeqAIJCUSPARSE *cusp;
2828: PetscFunctionBegin;
2829: MatCheckProduct(C, 1);
2830: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2831: A = product->A;
2832: B = product->B;
2833: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2834: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2835: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2836: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2837: switch (product->type) {
2838: case MATPRODUCT_AB:
2839: m = A->rmap->n;
2840: n = B->cmap->n;
2841: PetscCall(MatSetBlockSizesFromMats(C, A, B));
2842: break;
2843: case MATPRODUCT_AtB:
2844: m = A->cmap->n;
2845: n = B->cmap->n;
2846: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2847: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2848: break;
2849: case MATPRODUCT_ABt:
2850: m = A->rmap->n;
2851: n = B->rmap->n;
2852: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2853: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2854: break;
2855: case MATPRODUCT_PtAP:
2856: m = B->cmap->n;
2857: n = B->cmap->n;
2858: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2859: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2860: break;
2861: case MATPRODUCT_RARt:
2862: m = B->rmap->n;
2863: n = B->rmap->n;
2864: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2865: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2866: break;
2867: default:
2868: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2869: }
2870: PetscCall(MatSetSizes(C, m, n, m, n));
2871: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2872: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2873: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2875: /* product data */
2876: PetscCall(PetscNew(&mmdata));
2877: mmdata->cisdense = cisdense;
2878: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2879: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2880: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2881: #endif
2882: /* for these products we need intermediate storage */
2883: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2884: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2885: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2886: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2887: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2888: } else {
2889: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2890: }
2891: }
2892: C->product->data = mmdata;
2893: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
2895: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2896: PetscFunctionReturn(PETSC_SUCCESS);
2897: }
2899: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2900: {
2901: Mat_Product *product = C->product;
2902: Mat A, B;
2903: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2904: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2905: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2906: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2907: PetscBool flg;
2908: cusparseStatus_t stat;
2909: MatProductType ptype;
2910: MatProductCtx_MatMatCusparse *mmdata;
2911: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2912: cusparseSpMatDescr_t BmatSpDescr;
2913: #endif
2914: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2916: PetscFunctionBegin;
2917: MatCheckProduct(C, 1);
2918: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2919: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2920: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2921: mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
2922: A = product->A;
2923: B = product->B;
2924: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2925: mmdata->reusesym = PETSC_FALSE;
2926: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2927: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2928: Cmat = Ccusp->mat;
2929: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2930: Ccsr = (CsrMatrix *)Cmat->mat;
2931: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2932: goto finalize;
2933: }
2934: if (!c->nz) goto finalize;
2935: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2936: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2937: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2938: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2939: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2940: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2941: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2942: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2943: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2944: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2945: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2946: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2947: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2948: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2950: ptype = product->type;
2951: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2952: ptype = MATPRODUCT_AB;
2953: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2954: }
2955: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2956: ptype = MATPRODUCT_AB;
2957: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2958: }
2959: switch (ptype) {
2960: case MATPRODUCT_AB:
2961: Amat = Acusp->mat;
2962: Bmat = Bcusp->mat;
2963: break;
2964: case MATPRODUCT_AtB:
2965: Amat = Acusp->matTranspose;
2966: Bmat = Bcusp->mat;
2967: break;
2968: case MATPRODUCT_ABt:
2969: Amat = Acusp->mat;
2970: Bmat = Bcusp->matTranspose;
2971: break;
2972: default:
2973: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2974: }
2975: Cmat = Ccusp->mat;
2976: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2977: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2978: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2979: Acsr = (CsrMatrix *)Amat->mat;
2980: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2981: Ccsr = (CsrMatrix *)Cmat->mat;
2982: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2983: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2984: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2985: PetscCall(PetscLogGpuTimeBegin());
2986: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2987: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2988: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2989: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2990: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2991: PetscCallCUSPARSE(stat);
2992: #else
2993: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2994: PetscCallCUSPARSE(stat);
2995: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2996: PetscCallCUSPARSE(stat);
2997: #endif
2998: #else
2999: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3000: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3001: PetscCallCUSPARSE(stat);
3002: #endif
3003: PetscCall(PetscLogGpuFlops(mmdata->flops));
3004: PetscCallCUDA(WaitForCUDA());
3005: PetscCall(PetscLogGpuTimeEnd());
3006: C->offloadmask = PETSC_OFFLOAD_GPU;
3007: finalize:
3008: /* shorter version of MatAssemblyEnd_SeqAIJ */
3009: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3010: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3011: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3012: c->reallocs = 0;
3013: C->info.mallocs += 0;
3014: C->info.nz_unneeded = 0;
3015: C->assembled = C->was_assembled = PETSC_TRUE;
3016: C->num_ass++;
3017: PetscFunctionReturn(PETSC_SUCCESS);
3018: }
3020: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3021: {
3022: Mat_Product *product = C->product;
3023: Mat A, B;
3024: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3025: Mat_SeqAIJ *a, *b, *c;
3026: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3027: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3028: PetscInt i, j, m, n, k;
3029: PetscBool flg;
3030: cusparseStatus_t stat;
3031: MatProductType ptype;
3032: MatProductCtx_MatMatCusparse *mmdata;
3033: PetscLogDouble flops;
3034: PetscBool biscompressed, ciscompressed;
3035: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3036: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3037: cusparseSpMatDescr_t BmatSpDescr;
3038: #else
3039: int cnz;
3040: #endif
3041: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3043: PetscFunctionBegin;
3044: MatCheckProduct(C, 1);
3045: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3046: A = product->A;
3047: B = product->B;
3048: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3049: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3050: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3051: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3052: a = (Mat_SeqAIJ *)A->data;
3053: b = (Mat_SeqAIJ *)B->data;
3054: /* product data */
3055: PetscCall(PetscNew(&mmdata));
3056: C->product->data = mmdata;
3057: C->product->destroy = MatProductCtxDestroy_MatMatCusparse;
3059: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3060: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3061: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3062: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3063: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3064: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3066: ptype = product->type;
3067: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3068: ptype = MATPRODUCT_AB;
3069: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3070: }
3071: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3072: ptype = MATPRODUCT_AB;
3073: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3074: }
3075: biscompressed = PETSC_FALSE;
3076: ciscompressed = PETSC_FALSE;
3077: switch (ptype) {
3078: case MATPRODUCT_AB:
3079: m = A->rmap->n;
3080: n = B->cmap->n;
3081: k = A->cmap->n;
3082: Amat = Acusp->mat;
3083: Bmat = Bcusp->mat;
3084: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3085: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3086: break;
3087: case MATPRODUCT_AtB:
3088: m = A->cmap->n;
3089: n = B->cmap->n;
3090: k = A->rmap->n;
3091: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3092: Amat = Acusp->matTranspose;
3093: Bmat = Bcusp->mat;
3094: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3095: break;
3096: case MATPRODUCT_ABt:
3097: m = A->rmap->n;
3098: n = B->rmap->n;
3099: k = A->cmap->n;
3100: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3101: Amat = Acusp->mat;
3102: Bmat = Bcusp->matTranspose;
3103: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3104: break;
3105: default:
3106: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3107: }
3109: /* create cusparse matrix */
3110: PetscCall(MatSetSizes(C, m, n, m, n));
3111: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3112: c = (Mat_SeqAIJ *)C->data;
3113: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3114: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3115: Ccsr = new CsrMatrix;
3117: c->compressedrow.use = ciscompressed;
3118: if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
3119: c->compressedrow.nrows = a->compressedrow.nrows;
3120: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3121: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3122: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3123: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3124: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3125: } else {
3126: c->compressedrow.nrows = 0;
3127: c->compressedrow.i = NULL;
3128: c->compressedrow.rindex = NULL;
3129: Ccusp->workVector = NULL;
3130: Cmat->cprowIndices = NULL;
3131: }
3132: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3133: Ccusp->mat = Cmat;
3134: Ccusp->mat->mat = Ccsr;
3135: Ccsr->num_rows = Ccusp->nrows;
3136: Ccsr->num_cols = n;
3137: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3138: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3139: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3140: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3141: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3142: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3143: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3144: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3145: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3146: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3147: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3148: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3149: c->nz = 0;
3150: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3151: Ccsr->values = new THRUSTARRAY(c->nz);
3152: goto finalizesym;
3153: }
3155: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3156: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3157: Acsr = (CsrMatrix *)Amat->mat;
3158: if (!biscompressed) {
3159: Bcsr = (CsrMatrix *)Bmat->mat;
3160: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3161: BmatSpDescr = Bmat->matDescr;
3162: #endif
3163: } else { /* we need to use row offsets for the full matrix */
3164: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3165: Bcsr = new CsrMatrix;
3166: Bcsr->num_rows = B->rmap->n;
3167: Bcsr->num_cols = cBcsr->num_cols;
3168: Bcsr->num_entries = cBcsr->num_entries;
3169: Bcsr->column_indices = cBcsr->column_indices;
3170: Bcsr->values = cBcsr->values;
3171: if (!Bcusp->rowoffsets_gpu) {
3172: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3173: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3174: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3175: }
3176: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3177: mmdata->Bcsr = Bcsr;
3178: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3179: if (Bcsr->num_rows && Bcsr->num_cols) {
3180: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3181: PetscCallCUSPARSE(stat);
3182: }
3183: BmatSpDescr = mmdata->matSpBDescr;
3184: #endif
3185: }
3186: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3187: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3188: /* precompute flops count */
3189: if (ptype == MATPRODUCT_AB) {
3190: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3191: const PetscInt st = a->i[i];
3192: const PetscInt en = a->i[i + 1];
3193: for (j = st; j < en; j++) {
3194: const PetscInt brow = a->j[j];
3195: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3196: }
3197: }
3198: } else if (ptype == MATPRODUCT_AtB) {
3199: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3200: const PetscInt anzi = a->i[i + 1] - a->i[i];
3201: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3202: flops += (2. * anzi) * bnzi;
3203: }
3204: } else { /* TODO */
3205: flops = 0.;
3206: }
3208: mmdata->flops = flops;
3209: PetscCall(PetscLogGpuTimeBegin());
3211: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3212: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3213: // cuda-12.2 requires non-null csrRowOffsets
3214: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3215: PetscCallCUSPARSE(stat);
3216: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3217: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3218: {
3219: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3220: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3221: */
3222: void *dBuffer1 = NULL;
3223: void *dBuffer2 = NULL;
3224: void *dBuffer3 = NULL;
3225: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3226: size_t bufferSize1 = 0;
3227: size_t bufferSize2 = 0;
3228: size_t bufferSize3 = 0;
3229: size_t bufferSize4 = 0;
3230: size_t bufferSize5 = 0;
3232: /* ask bufferSize1 bytes for external memory */
3233: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3234: PetscCallCUSPARSE(stat);
3235: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3236: /* inspect the matrices A and B to understand the memory requirement for the next step */
3237: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3238: PetscCallCUSPARSE(stat);
3240: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3241: PetscCallCUSPARSE(stat);
3242: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3243: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3244: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3245: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3246: PetscCallCUSPARSE(stat);
3247: PetscCallCUDA(cudaFree(dBuffer1));
3248: PetscCallCUDA(cudaFree(dBuffer2));
3250: /* get matrix C non-zero entries C_nnz1 */
3251: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3252: c->nz = (PetscInt)C_nnz1;
3253: /* allocate matrix C */
3254: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3255: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3256: Ccsr->values = new THRUSTARRAY(c->nz);
3257: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3258: /* update matC with the new pointers */
3259: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3260: PetscCallCUSPARSE(stat);
3262: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3263: PetscCallCUSPARSE(stat);
3264: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3265: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3266: PetscCallCUSPARSE(stat);
3267: PetscCallCUDA(cudaFree(dBuffer3));
3268: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3269: PetscCallCUSPARSE(stat);
3270: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3271: }
3272: #else
3273: size_t bufSize2;
3274: /* ask bufferSize bytes for external memory */
3275: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3276: PetscCallCUSPARSE(stat);
3277: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3278: /* inspect the matrices A and B to understand the memory requirement for the next step */
3279: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3280: PetscCallCUSPARSE(stat);
3281: /* query the required external memory buffer size again */
3282: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3283: PetscCallCUSPARSE(stat);
3284: /* Neither the CUSPARSE documentation nor the API is clear here.
3285: We need both buffers to perform the operations properly!
3286: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3287: it only appears for the workEstimation stage, yet it seems to be needed in compute, so probably its address
3288: is stored in the descriptor! What a messy API... */
3289: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3290: /* compute the intermediate product of A * B */
3291: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3292: PetscCallCUSPARSE(stat);
3293: /* get matrix C non-zero entries C_nnz1 */
3294: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3295: c->nz = (PetscInt)C_nnz1;
3296: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3297: mmdata->mmBufferSize / 1024));
3298: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3299: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3300: Ccsr->values = new THRUSTARRAY(c->nz);
3301: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3302: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3303: PetscCallCUSPARSE(stat);
3304: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3305: PetscCallCUSPARSE(stat);
3306: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3307: #else
3308: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3309: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3310: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3311: PetscCallCUSPARSE(stat);
3312: c->nz = cnz;
3313: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3314: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3315: Ccsr->values = new THRUSTARRAY(c->nz);
3316: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3318: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3319: /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
3320: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for the values, but it seems quite buggy when
3321: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3322: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3323: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3324: PetscCallCUSPARSE(stat);
3325: #endif
3326: PetscCall(PetscLogGpuFlops(mmdata->flops));
3327: PetscCall(PetscLogGpuTimeEnd());
3328: finalizesym:
3329: c->free_a = PETSC_TRUE;
3330: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3331: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3332: c->free_ij = PETSC_TRUE;
3333: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3334: PetscInt *d_i = c->i;
3335: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3336: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3337: ii = *Ccsr->row_offsets;
3338: jj = *Ccsr->column_indices;
3339: if (ciscompressed) d_i = c->compressedrow.i;
3340: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3341: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3342: } else {
3343: PetscInt *d_i = c->i;
3344: if (ciscompressed) d_i = c->compressedrow.i;
3345: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3347: }
3348: if (ciscompressed) { /* need to expand host row offsets */
3349: PetscInt r = 0;
3350: c->i[0] = 0;
3351: for (k = 0; k < c->compressedrow.nrows; k++) {
3352: const PetscInt next = c->compressedrow.rindex[k];
3353: const PetscInt old = c->compressedrow.i[k];
3354: for (; r < next; r++) c->i[r + 1] = old;
3355: }
3356: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3357: }
3358: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3359: PetscCall(PetscMalloc1(m, &c->ilen));
3360: PetscCall(PetscMalloc1(m, &c->imax));
3361: c->maxnz = c->nz;
3362: c->nonzerorowcnt = 0;
3363: c->rmax = 0;
3364: for (k = 0; k < m; k++) {
3365: const PetscInt nn = c->i[k + 1] - c->i[k];
3366: c->ilen[k] = c->imax[k] = nn;
3367: c->nonzerorowcnt += (PetscInt)!!nn;
3368: c->rmax = PetscMax(c->rmax, nn);
3369: }
3370: PetscCall(PetscMalloc1(c->nz, &c->a));
3371: Ccsr->num_entries = c->nz;
3373: C->nonzerostate++;
3374: PetscCall(PetscLayoutSetUp(C->rmap));
3375: PetscCall(PetscLayoutSetUp(C->cmap));
3376: Ccusp->nonzerostate = C->nonzerostate;
3377: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3378: C->preallocated = PETSC_TRUE;
3379: C->assembled = PETSC_FALSE;
3380: C->was_assembled = PETSC_FALSE;
3381: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3382: mmdata->reusesym = PETSC_TRUE;
3383: C->offloadmask = PETSC_OFFLOAD_GPU;
3384: }
3385: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3386: PetscFunctionReturn(PETSC_SUCCESS);
3387: }
3389: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3391: /* handles sparse or dense B */
3392: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3393: {
3394: Mat_Product *product = mat->product;
3395: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3397: PetscFunctionBegin;
3398: MatCheckProduct(mat, 1);
3399: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3400: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3401: if (product->type == MATPRODUCT_ABC) {
3402: Ciscusp = PETSC_FALSE;
3403: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3404: }
3405: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3406: PetscBool usecpu = PETSC_FALSE;
3407: switch (product->type) {
3408: case MATPRODUCT_AB:
3409: if (product->api_user) {
3410: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3411: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3412: PetscOptionsEnd();
3413: } else {
3414: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3415: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3416: PetscOptionsEnd();
3417: }
3418: break;
3419: case MATPRODUCT_AtB:
3420: if (product->api_user) {
3421: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3422: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3423: PetscOptionsEnd();
3424: } else {
3425: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3426: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3427: PetscOptionsEnd();
3428: }
3429: break;
3430: case MATPRODUCT_PtAP:
3431: if (product->api_user) {
3432: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3433: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3434: PetscOptionsEnd();
3435: } else {
3436: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3437: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3438: PetscOptionsEnd();
3439: }
3440: break;
3441: case MATPRODUCT_RARt:
3442: if (product->api_user) {
3443: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3444: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3445: PetscOptionsEnd();
3446: } else {
3447: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3448: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3449: PetscOptionsEnd();
3450: }
3451: break;
3452: case MATPRODUCT_ABC:
3453: if (product->api_user) {
3454: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3455: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3456: PetscOptionsEnd();
3457: } else {
3458: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3459: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3460: PetscOptionsEnd();
3461: }
3462: break;
3463: default:
3464: break;
3465: }
3466: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3467: }
3468: /* dispatch */
3469: if (isdense) {
3470: switch (product->type) {
3471: case MATPRODUCT_AB:
3472: case MATPRODUCT_AtB:
3473: case MATPRODUCT_ABt:
3474: case MATPRODUCT_PtAP:
3475: case MATPRODUCT_RARt:
3476: if (product->A->boundtocpu) {
3477: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3478: } else {
3479: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3480: }
3481: break;
3482: case MATPRODUCT_ABC:
3483: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3484: break;
3485: default:
3486: break;
3487: }
3488: } else if (Biscusp && Ciscusp) {
3489: switch (product->type) {
3490: case MATPRODUCT_AB:
3491: case MATPRODUCT_AtB:
3492: case MATPRODUCT_ABt:
3493: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3494: break;
3495: case MATPRODUCT_PtAP:
3496: case MATPRODUCT_RARt:
3497: case MATPRODUCT_ABC:
3498: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3499: break;
3500: default:
3501: break;
3502: }
3503: } else { /* fallback for AIJ */
3504: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3505: }
3506: PetscFunctionReturn(PETSC_SUCCESS);
3507: }
3509: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3510: {
3511: PetscFunctionBegin;
3512: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3513: PetscFunctionReturn(PETSC_SUCCESS);
3514: }
3516: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3517: {
3518: PetscFunctionBegin;
3519: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3520: PetscFunctionReturn(PETSC_SUCCESS);
3521: }
3523: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3524: {
3525: PetscFunctionBegin;
3526: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3527: PetscFunctionReturn(PETSC_SUCCESS);
3528: }
3530: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3531: {
3532: PetscFunctionBegin;
3533: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3534: PetscFunctionReturn(PETSC_SUCCESS);
3535: }
3537: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3538: {
3539: PetscFunctionBegin;
3540: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3541: PetscFunctionReturn(PETSC_SUCCESS);
3542: }
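/* CUDA kernel used when the matrix is stored in compressed-row form: each thread adds one entry of the
   short work vector x into the full-length output y at the row index given by idx, i.e. y[idx[i]] += x[i] */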
3544: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3545: {
3546: int i = blockIdx.x * blockDim.x + threadIdx.x;
3547: if (i < n) y[idx[i]] += x[i];
3548: }
3550: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3551: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3552: {
3553: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3554: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3555: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3556: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3557: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3558: PetscBool compressed;
3559: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3560: PetscInt nx, ny;
3561: #endif
3563: PetscFunctionBegin;
3564: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3565: if (!a->nz) {
3566: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3567: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3568: PetscFunctionReturn(PETSC_SUCCESS);
3569: }
3570: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3571: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3572: if (!trans) {
3573: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3574: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3575: } else {
3576: if (herm || !A->form_explicit_transpose) {
3577: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3578: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3579: } else {
3580: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3581: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3582: }
3583: }
3584: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3585: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3587: try {
3588: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3589: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3590: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3592: PetscCall(PetscLogGpuTimeBegin());
3593: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3594: /* z = A x + beta y.
3595: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3596: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3597: */
3598: xptr = xarray;
3599: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3600: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3601: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3602: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3603: allocated to accommodate different uses. So we get the length info directly from mat.
3604: */
3605: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3606: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3607: nx = mat->num_cols; // since y = Ax
3608: ny = mat->num_rows;
3609: }
3610: #endif
3611: } else {
3612: /* z = A^T x + beta y
3613: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3614: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3615: */
3616: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3617: dptr = zarray;
3618: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3619: if (compressed) { /* Scatter x to work vector */
3620: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3622: thrust::for_each(
3623: #if PetscDefined(HAVE_THRUST_ASYNC)
3624: thrust::cuda::par.on(PetscDefaultCudaStream),
3625: #endif
3626: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3627: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3628: }
3629: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3630: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3631: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3632: nx = mat->num_rows; // since y = A^T x
3633: ny = mat->num_cols;
3634: }
3635: #endif
3636: }
3638: /* csr_spmv does y = alpha op(A) x + beta y */
3639: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3640: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3641: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3642: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3643: #else
3644: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3645: #endif
3647: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3648: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3649: if (!matDescr) {
3650: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3651: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3652: }
3653: #endif
3655: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3656: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3657: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3658: PetscCallCUSPARSE(
3659: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3660: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3661: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3662: PetscCallCUSPARSE(
3663: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3664: #endif
3665: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3666: } else {
3667: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3668: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3669: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3670: }
3672: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3673: #else
3674: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3675: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3676: #endif
3677: } else {
3678: if (cusparsestruct->nrows) {
3679: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3680: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3681: #else
3682: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3683: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3684: #endif
3685: }
3686: }
3687: PetscCall(PetscLogGpuTimeEnd());
3689: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3690: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3691: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3692: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3693: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3694: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3695: }
3696: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3697: PetscCall(VecSeq_CUDA::Set(zz, 0));
3698: }
3700: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3701: if (compressed) {
3702: PetscCall(PetscLogGpuTimeBegin());
3703: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3704: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3705: PetscCall(PetscLogGpuTimeEnd());
3706: }
3707: } else {
3708: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3709: }
3710: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3711: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3712: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3713: } catch (char *ex) {
3714: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3715: }
3716: if (yy) {
3717: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3718: } else {
3719: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3720: }
3721: PetscFunctionReturn(PETSC_SUCCESS);
3722: }
3724: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3725: {
3726: PetscFunctionBegin;
3727: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3728: PetscFunctionReturn(PETSC_SUCCESS);
3729: }
3731: PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqAIJ(Mat A, Vec xx);
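/* CUDA kernel: for each row x of the CSR matrix, scan its stored column indices for the diagonal entry and
   write it to diag[x]; rows with no stored diagonal entry get 0.0 */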
3733: __global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
3734: {
3735: const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
3737: if (x < len) {
3738: const PetscInt rowx = row[x], num_non0_row = row[x + 1] - rowx;
3739: PetscScalar d = 0.0;
3741: for (PetscInt i = 0; i < num_non0_row; i++) {
3742: if (col[i + rowx] == x) {
3743: d = val[i + rowx];
3744: break;
3745: }
3746: }
3747: diag[x] = d;
3748: }
3749: }
3751: static PetscErrorCode MatGetDiagonal_SeqAIJCUSPARSE(Mat A, Vec diag)
3752: {
3753: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3754: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3755: PetscScalar *darray;
3757: PetscFunctionBegin;
3758: if (A->offloadmask == PETSC_OFFLOAD_BOTH || A->offloadmask == PETSC_OFFLOAD_GPU) {
3759: PetscInt n = A->rmap->n;
3760: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3762: PetscCheck(cusparsestruct->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only CSR format supported");
3763: if (n > 0) {
3764: PetscCall(VecCUDAGetArrayWrite(diag, &darray));
3765: GetDiagonal_CSR<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), n, darray);
3766: PetscCallCUDA(cudaPeekAtLastError());
3767: PetscCall(VecCUDARestoreArrayWrite(diag, &darray));
3768: }
3769: } else PetscCall(MatGetDiagonal_SeqAIJ(A, diag));
3770: PetscFunctionReturn(PETSC_SUCCESS);
3771: }
3773: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3774: {
3775: PetscFunctionBegin;
3776: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3777: PetscFunctionReturn(PETSC_SUCCESS);
3778: }
3780: /*@
3781: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3783: Collective
3785: Input Parameters:
3786: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3787: . m - number of rows
3788: . n - number of columns
3789: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3790: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3792: Output Parameter:
3793: . A - the matrix
3795: Level: intermediate
3797: Notes:
3798: This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
3799: calculations. For good matrix assembly performance the user should preallocate the matrix
3800: storage by setting the parameter `nz` (or the array `nnz`).
3802: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3803: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3804: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3806: The AIJ format, also called
3807: compressed row storage, is fully compatible with standard Fortran
3808: storage. That is, the stored row and column indices can begin at
3809: either one (as in Fortran) or zero.
3811: Specify the preallocated storage with either nz or nnz (not both).
3812: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3813: allocation.
3815: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
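   Example Usage:
   A minimal sketch; the matrix size and the per-row nonzero estimate used here are purely illustrative.
.vb
   Mat A;

   PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 100, 100, 5, NULL, &A));
   /* ... insert entries with MatSetValues() ... */
   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
   PetscCall(MatDestroy(&A));
.ve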
3817: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3818: `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3819: @*/
3820: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3821: {
3822: PetscFunctionBegin;
3823: PetscCall(MatCreate(comm, A));
3824: PetscCall(MatSetSizes(*A, m, n, m, n));
3825: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3826: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3827: PetscFunctionReturn(PETSC_SUCCESS);
3828: }
3830: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3831: {
3832: PetscFunctionBegin;
3833: if (A->factortype == MAT_FACTOR_NONE) {
3834: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3835: } else {
3836: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3837: }
3838: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3839: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3840: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3841: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3842: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3843: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3844: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3845: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3846: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3847: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3848: PetscCall(MatDestroy_SeqAIJ(A));
3849: PetscFunctionReturn(PETSC_SUCCESS);
3850: }
3852: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3853: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3854: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3855: {
3856: PetscFunctionBegin;
3857: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3858: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3859: PetscFunctionReturn(PETSC_SUCCESS);
3860: }
3862: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3863: {
3864: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3865: Mat_SeqAIJCUSPARSE *cy;
3866: Mat_SeqAIJCUSPARSE *cx;
3867: PetscScalar *ay;
3868: const PetscScalar *ax;
3869: CsrMatrix *csry, *csrx;
3871: PetscFunctionBegin;
3872: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3873: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3874: if (X->ops->axpy != Y->ops->axpy) {
3875: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3876: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3877: PetscFunctionReturn(PETSC_SUCCESS);
3878: }
3879: /* if we are here, it means both matrices are bound to GPU */
3880: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3881: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3882: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3883: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3884: csry = (CsrMatrix *)cy->mat->mat;
3885: csrx = (CsrMatrix *)cx->mat->mat;
3886: /* see if we can turn this into a cublas axpy */
3887: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3888: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3889: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3890: if (eq) str = SAME_NONZERO_PATTERN;
3891: }
3892: /* spgeam is buggy with one column */
3893: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3895: if (str == SUBSET_NONZERO_PATTERN) {
3896: PetscScalar b = 1.0;
3897: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3898: size_t bufferSize;
3899: void *buffer;
3900: #endif
3902: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3903: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3904: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3905: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3906: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3907: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3908: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3909: PetscCall(PetscLogGpuTimeBegin());
3910: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3911: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3912: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3913: PetscCall(PetscLogGpuTimeEnd());
3914: PetscCallCUDA(cudaFree(buffer));
3915: #else
3916: PetscCall(PetscLogGpuTimeBegin());
3917: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3918: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3919: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3920: PetscCall(PetscLogGpuTimeEnd());
3921: #endif
3922: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3923: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3924: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3925: } else if (str == SAME_NONZERO_PATTERN) {
3926: cublasHandle_t cublasv2handle;
3927: PetscBLASInt one = 1, bnz = 1;
3929: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3930: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3931: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3932: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3933: PetscCall(PetscLogGpuTimeBegin());
3934: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3935: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3936: PetscCall(PetscLogGpuTimeEnd());
3937: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3938: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3939: } else {
3940: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3941: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3942: }
3943: PetscFunctionReturn(PETSC_SUCCESS);
3944: }
3946: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3947: {
3948: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3949: PetscScalar *ay;
3950: cublasHandle_t cublasv2handle;
3951: PetscBLASInt one = 1, bnz = 1;
3953: PetscFunctionBegin;
3954: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3955: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3956: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3957: PetscCall(PetscLogGpuTimeBegin());
3958: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3959: PetscCall(PetscLogGpuFlops(bnz));
3960: PetscCall(PetscLogGpuTimeEnd());
3961: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3962: PetscFunctionReturn(PETSC_SUCCESS);
3963: }
3965: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3966: {
3967: PetscBool gpu = PETSC_FALSE;
3968: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3970: PetscFunctionBegin;
3971: if (A->factortype == MAT_FACTOR_NONE) {
3972: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3973: if (spptr->mat) {
3974: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3975: if (matrix->values) {
3976: gpu = PETSC_TRUE;
3977: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3978: }
3979: }
3980: if (spptr->matTranspose) {
3981: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3982: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3983: }
3984: }
3985: if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
3986: else {
3987: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3988: A->offloadmask = PETSC_OFFLOAD_CPU;
3989: }
3990: PetscFunctionReturn(PETSC_SUCCESS);
3991: }
3993: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
3994: {
3995: PetscFunctionBegin;
3996: *m = PETSC_MEMTYPE_CUDA;
3997: PetscFunctionReturn(PETSC_SUCCESS);
3998: }
4000: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
4001: {
4002: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4004: PetscFunctionBegin;
4005: if (A->factortype != MAT_FACTOR_NONE) {
4006: A->boundtocpu = flg;
4007: PetscFunctionReturn(PETSC_SUCCESS);
4008: }
4009: if (flg) {
4010: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4012: A->ops->scale = MatScale_SeqAIJ;
4013: A->ops->getdiagonal = MatGetDiagonal_SeqAIJ;
4014: A->ops->axpy = MatAXPY_SeqAIJ;
4015: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
4016: A->ops->mult = MatMult_SeqAIJ;
4017: A->ops->multadd = MatMultAdd_SeqAIJ;
4018: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
4019: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
4020: A->ops->multhermitiantranspose = NULL;
4021: A->ops->multhermitiantransposeadd = NULL;
4022: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
4023: A->ops->getcurrentmemtype = NULL;
4024: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
4025: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
4026: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
4027: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
4028: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4029: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4030: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4031: } else {
4032: A->ops->scale = MatScale_SeqAIJCUSPARSE;
4033: A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE;
4034: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
4035: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
4036: A->ops->mult = MatMult_SeqAIJCUSPARSE;
4037: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
4038: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
4039: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
4040: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4041: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4042: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
4043: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4044: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4045: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4046: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4047: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4048: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4049: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4050: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4052: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4053: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4054: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4055: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4056: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4057: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4058: }
4059: A->boundtocpu = flg;
4060: if (flg && a->inode.size_csr) {
4061: a->inode.use = PETSC_TRUE;
4062: } else {
4063: a->inode.use = PETSC_FALSE;
4064: }
4065: PetscFunctionReturn(PETSC_SUCCESS);
4066: }
4068: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4069: {
4070: Mat B;
4072: PetscFunctionBegin;
4073: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4074: if (reuse == MAT_INITIAL_MATRIX) {
4075: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4076: } else if (reuse == MAT_REUSE_MATRIX) {
4077: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4078: }
4079: B = *newmat;
4081: PetscCall(PetscFree(B->defaultvectype));
4082: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4084: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4085: if (B->factortype == MAT_FACTOR_NONE) {
4086: Mat_SeqAIJCUSPARSE *spptr;
4087: PetscCall(PetscNew(&spptr));
4088: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4089: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4090: spptr->format = MAT_CUSPARSE_CSR;
4091: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4092: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4093: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4094: #else
4095: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4096: #endif
4097: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4098: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4099: #endif
4100: B->spptr = spptr;
4101: } else {
4102: Mat_SeqAIJCUSPARSETriFactors *spptr;
4104: PetscCall(PetscNew(&spptr));
4105: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4106: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4107: B->spptr = spptr;
4108: }
4109: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4110: }
4111: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4112: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4113: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4114: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4115: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4116: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4117: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4119: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4120: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4121: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4122: #if defined(PETSC_HAVE_HYPRE)
4123: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4124: #endif
4125: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4126: PetscFunctionReturn(PETSC_SUCCESS);
4127: }
4129: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4130: {
4131: PetscFunctionBegin;
4132: PetscCall(MatCreate_SeqAIJ(B));
4133: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4134: PetscFunctionReturn(PETSC_SUCCESS);
4135: }
4137: /*MC
4138: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4140: Options Database Keys:
4141: + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4142: . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4143: Other options include ell (ellpack) or hyb (hybrid).
4144: . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4145: - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
4147: Level: beginner
4149: Notes:
4150: These matrices can be in either CSR, ELL, or HYB format.
4152: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4154: Uses 32-bit integers internally. If PETSc is configured with `--with-64-bit-indices`, the integer row and column indices are stored on the GPU as `int`. It is unclear what happens
4155: if some integer values passed in do not fit in `int`.
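   Example Usage:
   A minimal sketch of selecting this type and its storage format from the options database; the executable name below is illustrative.
.vb
   ./my_app -mat_type aijcusparse -mat_cusparse_storage_format csr
   ./my_app -mat_type aijcusparse -mat_cusparse_use_cpu_solve
.ve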
4157: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4158: M*/
4160: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4161: {
4162: PetscFunctionBegin;
4163: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4164: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4165: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4166: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4167: PetscFunctionReturn(PETSC_SUCCESS);
4168: }
4170: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4171: {
4172: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4174: PetscFunctionBegin;
4175: if (cusp) {
4176: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4177: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4178: delete cusp->workVector;
4179: delete cusp->rowoffsets_gpu;
4180: delete cusp->csr2csc_i;
4181: delete cusp->coords;
4182: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4183: PetscCall(PetscFree(mat->spptr));
4184: }
4185: PetscFunctionReturn(PETSC_SUCCESS);
4186: }
4188: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4189: {
4190: PetscFunctionBegin;
4191: if (*mat) {
4192: delete (*mat)->values;
4193: delete (*mat)->column_indices;
4194: delete (*mat)->row_offsets;
4195: delete *mat;
4196: *mat = 0;
4197: }
4198: PetscFunctionReturn(PETSC_SUCCESS);
4199: }
4201: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4202: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4203: {
4204: PetscFunctionBegin;
4205: if (*trifactor) {
4206: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4207: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4208: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4209: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4210: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4211: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4212: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4213: #endif
4214: PetscCall(PetscFree(*trifactor));
4215: }
4216: PetscFunctionReturn(PETSC_SUCCESS);
4217: }
4218: #endif
4220: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4221: {
4222: CsrMatrix *mat;
4224: PetscFunctionBegin;
4225: if (*matstruct) {
4226: if ((*matstruct)->mat) {
4227: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4228: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4229: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4230: #else
4231: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4232: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4233: #endif
4234: } else {
4235: mat = (CsrMatrix *)(*matstruct)->mat;
4236: PetscCall(CsrMatrix_Destroy(&mat));
4237: }
4238: }
4239: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4240: delete (*matstruct)->cprowIndices;
4241: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4242: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4243: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4245: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4246: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4247: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4249: for (int i = 0; i < 3; i++) {
4250: if (mdata->cuSpMV[i].initialized) {
4251: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4252: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4253: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4254: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4255: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4256: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4257: #endif
4258: }
4259: }
4260: #endif
4261: delete *matstruct;
4262: *matstruct = NULL;
4263: }
4264: PetscFunctionReturn(PETSC_SUCCESS);
4265: }
4267: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4268: {
4269: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4271: PetscFunctionBegin;
4272: if (fs) {
4273: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4274: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4275: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4276: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4277: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4278: delete fs->workVector;
4279: fs->workVector = NULL;
4280: #endif
4281: delete fs->rpermIndices;
4282: delete fs->cpermIndices;
4283: fs->rpermIndices = NULL;
4284: fs->cpermIndices = NULL;
4285: fs->init_dev_prop = PETSC_FALSE;
4286: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4287: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4288: PetscCallCUDA(cudaFree(fs->csrColIdx));
4289: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4290: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4291: PetscCallCUDA(cudaFree(fs->csrVal));
4292: PetscCallCUDA(cudaFree(fs->diag));
4293: PetscCallCUDA(cudaFree(fs->X));
4294: PetscCallCUDA(cudaFree(fs->Y));
4295: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
4296: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4297: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4298: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4299: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4300: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4301: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4302: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4303: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4304: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4305: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4306: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4307: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4308: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4309: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4310: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4311: PetscCall(PetscFree(fs->csrRowPtr_h));
4312: PetscCall(PetscFree(fs->csrVal_h));
4313: PetscCall(PetscFree(fs->diag_h));
4314: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4315: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4316: #endif
4317: }
4318: PetscFunctionReturn(PETSC_SUCCESS);
4319: }
4321: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4322: {
4323: PetscFunctionBegin;
4324: if (*trifactors) {
4325: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4326: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4327: PetscCall(PetscFree(*trifactors));
4328: }
4329: PetscFunctionReturn(PETSC_SUCCESS);
4330: }
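/* Compares (row, column) index tuples lexicographically: first by row, then by column; suitable as a
   comparator for Thrust sorts and merges of device-side COO index pairs */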
4332: struct IJCompare {
4333: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4334: {
4335: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4336: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4337: return false;
4338: }
4339: };
4341: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4342: {
4343: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4345: PetscFunctionBegin;
4346: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4347: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4348: if (destroy) {
4349: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4350: delete cusp->csr2csc_i;
4351: cusp->csr2csc_i = NULL;
4352: }
4353: A->transupdated = PETSC_FALSE;
4354: PetscFunctionReturn(PETSC_SUCCESS);
4355: }
4357: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4358: {
4359: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4361: PetscFunctionBegin;
4362: PetscCallCUDA(cudaFree(coo->perm));
4363: PetscCallCUDA(cudaFree(coo->jmap));
4364: PetscCall(PetscFree(coo));
4365: PetscFunctionReturn(PETSC_SUCCESS);
4366: }
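/* COO preallocation: the host-side analysis (building jmap[] and perm[]) is delegated to the SeqAIJ
   implementation below; the resulting MatCOOStruct is then shallow-copied and its jmap/perm arrays
   mirrored in device memory so that MatSetValuesCOO() can run entirely on the GPU */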
4368: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4369: {
4370: PetscBool dev_ij = PETSC_FALSE;
4371: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4372: PetscInt *i, *j;
4373: PetscContainer container_h;
4374: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4376: PetscFunctionBegin;
4377: PetscCall(PetscGetMemType(coo_i, &mtype));
4378: if (PetscMemTypeDevice(mtype)) {
4379: dev_ij = PETSC_TRUE;
4380: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4381: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4382: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4383: } else {
4384: i = coo_i;
4385: j = coo_j;
4386: }
4388: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4389: if (dev_ij) PetscCall(PetscFree2(i, j));
4390: mat->offloadmask = PETSC_OFFLOAD_CPU;
4391: // Create the GPU memory
4392: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4394: // Copy the COO struct to device
4395: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4396: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4397: PetscCall(PetscMalloc1(1, &coo_d));
4398: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4399: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4400: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4401: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4402: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4404: // Put the COO struct in a container and then attach that to the matrix
4405: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4406: PetscFunctionReturn(PETSC_SUCCESS);
4407: }
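/* Device kernel used by MatSetValuesCOO_SeqAIJCUSPARSE(): a grid-stride loop over the CSR nonzeros in
   which thread i sums the user-provided COO values kv[perm[jmap[i]..jmap[i+1])] that map to nonzero i,
   then inserts or adds the sum according to imode */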
4409: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4410: {
4411: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4412: const PetscCount grid_size = gridDim.x * blockDim.x;
4413: for (; i < nnz; i += grid_size) {
4414: PetscScalar sum = 0.0;
4415: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4416: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4417: }
4418: }
4420: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4421: {
4422: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4423: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4424: PetscCount Annz = seq->nz;
4425: PetscMemType memtype;
4426: const PetscScalar *v1 = v;
4427: PetscScalar *Aa;
4428: PetscContainer container;
4429: MatCOOStruct_SeqAIJ *coo;
4431: PetscFunctionBegin;
4432: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4434: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4435: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4437: PetscCall(PetscGetMemType(v, &memtype));
4438: if (PetscMemTypeHost(memtype)) { /* If the user passed v[] in host memory, copy it to the device */
4439: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4440: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4441: }
4443: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4444: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4446: PetscCall(PetscLogGpuTimeBegin());
4447: if (Annz) {
4448: MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4449: PetscCallCUDA(cudaPeekAtLastError());
4450: }
4451: PetscCall(PetscLogGpuTimeEnd());
4453: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4454: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4456: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4457: PetscFunctionReturn(PETSC_SUCCESS);
4458: }
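/* A minimal host-side usage sketch of the COO assembly path above (assuming a 2x2 matrix with three
   nonzeros; the matrix name A and the arrays ci, cj, va are illustrative only, and va may equally well
   point to device memory):

     Mat         A;
     PetscInt    ci[] = {0, 0, 1};        // COO row indices
     PetscInt    cj[] = {0, 1, 1};        // COO column indices
     PetscScalar va[] = {1.0, 2.0, 3.0};  // COO values
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 3, ci, cj));  // builds the host and device MatCOOStruct
     PetscCall(MatSetValuesCOO(A, va, INSERT_VALUES)); // launches MatAddCOOValues() to fill the CSR values
     PetscCall(MatDestroy(&A));
*/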
4460: /*@C
4461: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4463: Not Collective
4465: Input Parameters:
4466: + A - the matrix
4467: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
4469: Output Parameters:
4470: + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4471: - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4473: Level: developer
4475: Note:
4476: When compressed is true, the CSR structure does not contain empty rows
4478: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4479: @*/
4480: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4481: {
4482: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4483: CsrMatrix *csr;
4484: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4486: PetscFunctionBegin;
4488: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4489: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4490: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4491: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4492: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4493: csr = (CsrMatrix *)cusp->mat->mat;
4494: if (i) {
4495: if (!compressed && a->compressedrow.use) { /* need full row offset */
4496: if (!cusp->rowoffsets_gpu) {
4497: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4498: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4499: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4500: }
4501: *i = cusp->rowoffsets_gpu->data().get();
4502: } else *i = csr->row_offsets->data().get();
4503: }
4504: if (j) *j = csr->column_indices->data().get();
4505: PetscFunctionReturn(PETSC_SUCCESS);
4506: }
4508: /*@C
4509: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4511: Not Collective
4513: Input Parameters:
4514: + A - the matrix
4515: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
4516: . i - the CSR row pointers
4517: - j - the CSR column indices
4519: Level: developer
4521: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4522: @*/
4523: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4524: {
4525: PetscFunctionBegin;
4527: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4528: if (i) *i = NULL;
4529: if (j) *j = NULL;
4530: (void)compressed;
4531: PetscFunctionReturn(PETSC_SUCCESS);
4532: }
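/* A minimal usage sketch for the two routines above (assuming an assembled MATSEQAIJCUSPARSE matrix A;
   the returned pointers reference device memory and are 32-bit ints regardless of PETSc's index size):

     const int *di, *dj;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj));
     // with compressed = PETSC_FALSE, di holds A->rmap->n + 1 row offsets and dj the column index of
     // every stored nonzero; both can be handed to cuSPARSE calls or custom CUDA kernels
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj));
*/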
4534: /*@C
4535: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the device array where the nonzero entries of a `MATSEQAIJCUSPARSE` matrix are stored
4537: Not Collective
4539: Input Parameter:
4540: . A - a `MATSEQAIJCUSPARSE` matrix
4542: Output Parameter:
4543: . a - pointer to the device data
4545: Level: developer
4547: Note:
4548: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4550: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4551: @*/
4552: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4553: {
4554: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4555: CsrMatrix *csr;
4557: PetscFunctionBegin;
4559: PetscAssertPointer(a, 2);
4560: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4561: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4562: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4563: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4564: csr = (CsrMatrix *)cusp->mat->mat;
4565: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4566: *a = csr->values->data().get();
4567: PetscFunctionReturn(PETSC_SUCCESS);
4568: }
4570: /*@C
4571: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4573: Not Collective
4575: Input Parameters:
4576: + A - a `MATSEQAIJCUSPARSE` matrix
4577: - a - pointer to the device data
4579: Level: developer
4581: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4582: @*/
4583: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4584: {
4585: PetscFunctionBegin;
4587: PetscAssertPointer(a, 2);
4588: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4589: *a = NULL;
4590: PetscFunctionReturn(PETSC_SUCCESS);
4591: }
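/* A minimal usage sketch for read-only access (assuming an assembled MATSEQAIJCUSPARSE matrix A with at
   least one stored nonzero): the pointer references device memory, so inspecting a value from the host
   requires an explicit copy.

     const PetscScalar *da;
     PetscScalar        first;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &da));
     PetscCallCUDA(cudaMemcpy(&first, da, sizeof(PetscScalar), cudaMemcpyDeviceToHost)); // peek at the first stored nonzero
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &da));
*/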
4593: /*@C
4594: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4596: Not Collective
4598: Input Parameter:
4599: . A - a `MATSEQAIJCUSPARSE` matrix
4601: Output Parameter:
4602: . a - pointer to the device data
4604: Level: developer
4606: Note:
4607: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4609: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4610: @*/
4611: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4612: {
4613: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4614: CsrMatrix *csr;
4616: PetscFunctionBegin;
4618: PetscAssertPointer(a, 2);
4619: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4620: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4621: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4622: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4623: csr = (CsrMatrix *)cusp->mat->mat;
4624: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4625: *a = csr->values->data().get();
4626: A->offloadmask = PETSC_OFFLOAD_GPU;
4627: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4628: PetscFunctionReturn(PETSC_SUCCESS);
4629: }
4630: /*@C
4631: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4633: Not Collective
4635: Input Parameters:
4636: + A - a `MATSEQAIJCUSPARSE` matrix
4637: - a - pointer to the device data
4639: Level: developer
4641: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4642: @*/
4643: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4644: {
4645: PetscFunctionBegin;
4647: PetscAssertPointer(a, 2);
4648: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4649: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4650: *a = NULL;
4651: PetscFunctionReturn(PETSC_SUCCESS);
4652: }
4654: /*@C
4655: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4657: Not Collective
4659: Input Parameter:
4660: . A - a `MATSEQAIJCUSPARSE` matrix
4662: Output Parameter:
4663: . a - pointer to the device data
4665: Level: developer
4667: Note:
4668: Does not trigger any host to device copies.
4670: It marks the GPU data as valid, so users must set all the values in `a` to ensure out-of-date data is not considered current
4672: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4673: @*/
4674: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4675: {
4676: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4677: CsrMatrix *csr;
4679: PetscFunctionBegin;
4681: PetscAssertPointer(a, 2);
4682: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4683: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4684: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4685: csr = (CsrMatrix *)cusp->mat->mat;
4686: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4687: *a = csr->values->data().get();
4688: A->offloadmask = PETSC_OFFLOAD_GPU;
4689: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4690: PetscFunctionReturn(PETSC_SUCCESS);
4691: }
4693: /*@C
4694: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4696: Not Collective
4698: Input Parameters:
4699: + A - a `MATSEQAIJCUSPARSE` matrix
4700: - a - pointer to the device data
4702: Level: developer
4704: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4705: @*/
4706: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4707: {
4708: PetscFunctionBegin;
4710: PetscAssertPointer(a, 2);
4711: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4712: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4713: *a = NULL;
4714: PetscFunctionReturn(PETSC_SUCCESS);
4715: }
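/* A minimal usage sketch for write-only access (assuming an assembled MATSEQAIJCUSPARSE matrix A): since
   MatSeqAIJCUSPARSEGetArrayWrite() performs no host-to-device copy, every stored value must be written;
   here all nonzeros are zeroed, relying on the all-zero bit pattern of PetscScalar 0.0.

     Mat_SeqAIJ  *aseq = (Mat_SeqAIJ *)A->data;
     PetscScalar *da;
     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &da));
     PetscCallCUDA(cudaMemset(da, 0, aseq->nz * sizeof(PetscScalar)));
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &da));
*/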
4717: struct IJCompare4 {
4718: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4719: {
4720: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4721: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4722: return false;
4723: }
4724: };
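/* Unary functor adding a fixed offset to a column index; used below to shift B's column indices by
   A->cmap->n when forming the merged matrix */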
4726: struct Shift {
4727: int _shift;
4729: Shift(int shift) : _shift(shift) { }
4730: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4731: };
4733: /* Merges two SeqAIJCUSPARSE matrices A and B by concatenating them side by side, C = [A B] (the [A';B']' operation in MATLAB notation) */
4734: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4735: {
4736: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4737: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4738: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4739: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4740: PetscInt Annz, Bnnz;
4741: cusparseStatus_t stat;
4742: PetscInt i, m, n, zero = 0;
4744: PetscFunctionBegin;
4747: PetscAssertPointer(C, 4);
4748: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4749: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4750: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4751: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4752: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4753: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4754: if (reuse == MAT_INITIAL_MATRIX) {
4755: m = A->rmap->n;
4756: n = A->cmap->n + B->cmap->n;
4757: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4758: PetscCall(MatSetSizes(*C, m, n, m, n));
4759: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4760: c = (Mat_SeqAIJ *)(*C)->data;
4761: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4762: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4763: Ccsr = new CsrMatrix;
4764: Cmat->cprowIndices = NULL;
4765: c->compressedrow.use = PETSC_FALSE;
4766: c->compressedrow.nrows = 0;
4767: c->compressedrow.i = NULL;
4768: c->compressedrow.rindex = NULL;
4769: Ccusp->workVector = NULL;
4770: Ccusp->nrows = m;
4771: Ccusp->mat = Cmat;
4772: Ccusp->mat->mat = Ccsr;
4773: Ccsr->num_rows = m;
4774: Ccsr->num_cols = n;
4775: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4776: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4777: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4778: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4779: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4780: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4781: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4782: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4783: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4784: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4785: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4786: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4787: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4789: Acsr = (CsrMatrix *)Acusp->mat->mat;
4790: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4791: Annz = (PetscInt)Acsr->column_indices->size();
4792: Bnnz = (PetscInt)Bcsr->column_indices->size();
4793: c->nz = Annz + Bnnz;
4794: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4795: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4796: Ccsr->values = new THRUSTARRAY(c->nz);
4797: Ccsr->num_entries = c->nz;
4798: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4799: if (c->nz) {
4800: auto Acoo = new THRUSTINTARRAY32(Annz);
4801: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4802: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4803: THRUSTINTARRAY32 *Aroff, *Broff;
4805: if (a->compressedrow.use) { /* need full row offset */
4806: if (!Acusp->rowoffsets_gpu) {
4807: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4808: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4809: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4810: }
4811: Aroff = Acusp->rowoffsets_gpu;
4812: } else Aroff = Acsr->row_offsets;
4813: if (b->compressedrow.use) { /* need full row offset */
4814: if (!Bcusp->rowoffsets_gpu) {
4815: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4816: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4817: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4818: }
4819: Broff = Bcusp->rowoffsets_gpu;
4820: } else Broff = Bcsr->row_offsets;
4821: PetscCall(PetscLogGpuTimeBegin());
4822: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4823: PetscCallCUSPARSE(stat);
4824: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4825: PetscCallCUSPARSE(stat);
4826: /* Using bool here caused issues with large matrices on SUMMIT with CUDA 10.2.89 */
4827: auto Aperm = thrust::make_constant_iterator(1);
4828: auto Bperm = thrust::make_constant_iterator(0);
4829: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4830: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4831: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4832: #else
4833: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4834: auto Bcib = Bcsr->column_indices->begin();
4835: auto Bcie = Bcsr->column_indices->end();
4836: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4837: #endif
4838: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4839: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4840: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4841: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4842: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4843: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4844: auto p1 = Ccusp->coords->begin();
4845: auto p2 = Ccusp->coords->begin();
4846: thrust::advance(p2, Annz);
4847: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4848: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4849: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4850: #endif
4851: auto cci = thrust::make_counting_iterator(zero);
4852: auto cce = thrust::make_counting_iterator(c->nz);
4853: #if 0 // Errors on SUMMIT with CUDA 11.1.0
4854: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4855: #else
4856: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4857: auto pred = thrust::identity<int>();
4858: #else
4859: auto pred = cuda::std::identity();
4860: #endif
4861: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4862: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4863: #endif
4864: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4865: PetscCallCUSPARSE(stat);
4866: PetscCall(PetscLogGpuTimeEnd());
4867: delete wPerm;
4868: delete Acoo;
4869: delete Bcoo;
4870: delete Ccoo;
4871: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4872: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4873: PetscCallCUSPARSE(stat);
4874: #endif
4875: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4876: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4877: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4878: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4879: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4880: CsrMatrix *CcsrT = new CsrMatrix;
4881: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4882: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4884: (*C)->form_explicit_transpose = PETSC_TRUE;
4885: (*C)->transupdated = PETSC_TRUE;
4886: Ccusp->rowoffsets_gpu = NULL;
4887: CmatT->cprowIndices = NULL;
4888: CmatT->mat = CcsrT;
4889: CcsrT->num_rows = n;
4890: CcsrT->num_cols = m;
4891: CcsrT->num_entries = c->nz;
4893: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4894: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4895: CcsrT->values = new THRUSTARRAY(c->nz);
4897: PetscCall(PetscLogGpuTimeBegin());
4898: auto rT = CcsrT->row_offsets->begin();
4899: if (AT) {
4900: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4901: thrust::advance(rT, -1);
4902: }
4903: if (BT) {
4904: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4905: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4906: thrust::copy(titb, tite, rT);
4907: }
4908: auto cT = CcsrT->column_indices->begin();
4909: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4910: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4911: auto vT = CcsrT->values->begin();
4912: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4913: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4914: PetscCall(PetscLogGpuTimeEnd());
4916: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4917: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4918: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4919: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4920: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4921: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4922: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4923: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4924: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4925: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4926: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4927: PetscCallCUSPARSE(stat);
4928: #endif
4929: Ccusp->matTranspose = CmatT;
4930: }
4931: }
4933: c->free_a = PETSC_TRUE;
4934: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4935: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4936: c->free_ij = PETSC_TRUE;
4937: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4938: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4939: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4940: ii = *Ccsr->row_offsets;
4941: jj = *Ccsr->column_indices;
4942: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4943: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4944: } else {
4945: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4946: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4947: }
4948: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4949: PetscCall(PetscMalloc1(m, &c->ilen));
4950: PetscCall(PetscMalloc1(m, &c->imax));
4951: c->maxnz = c->nz;
4952: c->nonzerorowcnt = 0;
4953: c->rmax = 0;
4954: for (i = 0; i < m; i++) {
4955: const PetscInt nn = c->i[i + 1] - c->i[i];
4956: c->ilen[i] = c->imax[i] = nn;
4957: c->nonzerorowcnt += (PetscInt)!!nn;
4958: c->rmax = PetscMax(c->rmax, nn);
4959: }
4960: PetscCall(PetscMalloc1(c->nz, &c->a));
4961: (*C)->nonzerostate++;
4962: PetscCall(PetscLayoutSetUp((*C)->rmap));
4963: PetscCall(PetscLayoutSetUp((*C)->cmap));
4964: Ccusp->nonzerostate = (*C)->nonzerostate;
4965: (*C)->preallocated = PETSC_TRUE;
4966: } else {
4967: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4968: c = (Mat_SeqAIJ *)(*C)->data;
4969: if (c->nz) {
4970: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4971: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4972: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4973: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4974: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4975: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4976: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4977: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4978: Acsr = (CsrMatrix *)Acusp->mat->mat;
4979: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4980: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4981: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4982: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4983: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4984: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4985: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4986: auto pmid = Ccusp->coords->begin();
4987: thrust::advance(pmid, Acsr->num_entries);
4988: PetscCall(PetscLogGpuTimeBegin());
4989: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4990: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4991: thrust::for_each(zibait, zieait, VecCUDAEquals());
4992: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4993: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4994: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4995: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4996: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4997: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4998: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4999: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
5000: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
5001: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
5002: auto vT = CcsrT->values->begin();
5003: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
5004: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
5005: (*C)->transupdated = PETSC_TRUE;
5006: }
5007: PetscCall(PetscLogGpuTimeEnd());
5008: }
5009: }
5010: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5011: (*C)->assembled = PETSC_TRUE;
5012: (*C)->was_assembled = PETSC_FALSE;
5013: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
5014: PetscFunctionReturn(PETSC_SUCCESS);
5015: }
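/* A minimal usage sketch for MatSeqAIJCUSPARSEMergeMats() (assuming assembled MATSEQAIJCUSPARSE matrices
   A and B with the same number of rows): C becomes the horizontal concatenation [A B], assembled and
   resident on the GPU.

     Mat C;
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... later, after the values (but not the nonzero pattern) of A or B change:
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
     PetscCall(MatDestroy(&C));
*/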
5017: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
5018: {
5019: bool dmem;
5020: const PetscScalar *av;
5022: PetscFunctionBegin;
5023: dmem = isCudaMem(v);
5024: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5025: if (n && idx) {
5026: THRUSTINTARRAY widx(n);
5027: widx.assign(idx, idx + n);
5028: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5030: THRUSTARRAY *w = NULL;
5031: thrust::device_ptr<PetscScalar> dv;
5032: if (dmem) {
5033: dv = thrust::device_pointer_cast(v);
5034: } else {
5035: w = new THRUSTARRAY(n);
5036: dv = w->data();
5037: }
5038: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5040: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5041: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5042: thrust::for_each(zibit, zieit, VecCUDAEquals());
5043: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5044: delete w;
5045: } else {
5046: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5047: }
5048: if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar))); /* the values were copied device-to-host above */
5049: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5050: PetscFunctionReturn(PETSC_SUCCESS);
5051: }