Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/vecimpl.h>
12: #undef VecType
13: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14: #include <thrust/adjacent_difference.h>
15: #if PETSC_CPP_VERSION >= 14
16: #define PETSC_HAVE_THRUST_ASYNC 1
17: // thrust::for_each(thrust::cuda::par.on()) requires C++14
18: #endif
19: #include <thrust/iterator/constant_iterator.h>
20: #include <thrust/remove.h>
21: #include <thrust/sort.h>
22: #include <thrust/unique.h>
23: #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST)
24: #include <cuda/std/functional>
25: #endif
27: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
28: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29: /*
30: The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
31: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
32: */
33: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
34: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
35: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
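/* For example, CUDA 11.0's cusparse.h defines CUSPARSE_MV_ALG_DEFAULT=0, CUSPARSE_COOMV_ALG=1,
   CUSPARSE_CSRMV_ALG1=2 and CUSPARSE_CSRMV_ALG2=3, matching positions 0-3 of MatCUSPARSESpMVAlgorithms[]
   above (the PetscCheck()s in MatSetFromOptions_SeqAIJCUSPARSE() guard this correspondence). The trailing
   entries (enum type name, option prefix, 0) follow the usual PetscOptionsEnum() string-array convention. */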
36: #endif
38: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
39: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
40: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
41: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
42: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
43: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
44: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
45: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
46: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
47: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
48: #endif
49: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
50: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
51: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
52: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
53: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
54: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
55: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
56: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
57: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
58: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
60: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
61: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
62: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
63: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
65: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
66: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
68: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
69: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
70: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
72: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
73: {
74: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
76: PetscFunctionBegin;
77: switch (op) {
78: case MAT_CUSPARSE_MULT:
79: cusparsestruct->format = format;
80: break;
81: case MAT_CUSPARSE_ALL:
82: cusparsestruct->format = format;
83: break;
84: default:
85: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
86: }
87: PetscFunctionReturn(PETSC_SUCCESS);
88: }
90: /*@
91: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
92: operation. Only the `MatMult()` operation can use different GPU storage formats.
94: Not Collective
96: Input Parameters:
97: + A - Matrix of type `MATSEQAIJCUSPARSE`
98: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
99: `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
100: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)
102: Level: intermediate
104: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
105: @*/
106: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
107: {
108: PetscFunctionBegin;
110: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
111: PetscFunctionReturn(PETSC_SUCCESS);
112: }
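/* Example (editorial sketch, not part of the original source): select the ELL storage format
   for MatMult() on a sequential AIJ cuSPARSE matrix, assuming a matrix A has already been created:

     MatSetType(A, MATSEQAIJCUSPARSE);
     MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL);

   or, equivalently, from the command line: -mat_type seqaijcusparse -mat_cusparse_mult_storage_format ell */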
114: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
115: {
116: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
118: PetscFunctionBegin;
119: cusparsestruct->use_cpu_solve = use_cpu;
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: /*@
124: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.
126: Input Parameters:
127: + A - Matrix of type `MATSEQAIJCUSPARSE`
128: - use_cpu - set flag for using the built-in CPU `MatSolve()`
130: Level: intermediate
132: Note:
133: The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
134: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
135: Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).
137: .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
138: @*/
139: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
140: {
141: PetscFunctionBegin;
143: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
144: PetscFunctionReturn(PETSC_SUCCESS);
145: }
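/* Example (editorial sketch, not part of the original source): keep the (I)LU triangular solves on
   the CPU while the other matrix operations stay on the GPU:

     MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE);

   or from the command line: -mat_cusparse_use_cpu_solve */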
147: static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
148: {
149: PetscFunctionBegin;
150: switch (op) {
151: case MAT_FORM_EXPLICIT_TRANSPOSE:
152: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
153: if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
154: A->form_explicit_transpose = flg;
155: break;
156: default:
157: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
158: break;
159: }
160: PetscFunctionReturn(PETSC_SUCCESS);
161: }
163: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
164: {
165: MatCUSPARSEStorageFormat format;
166: PetscBool flg;
167: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
169: PetscFunctionBegin;
170: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
171: if (A->factortype == MAT_FACTOR_NONE) {
172: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
173: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));
175: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
176: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
177: PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
178: if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
179: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
180: PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
181: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
182: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
183: PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
184: #else
185: PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
186: #endif
187: PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
188: PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
190: PetscCall(
191: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
192: PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
193: #endif
194: }
195: PetscOptionsHeadEnd();
196: PetscFunctionReturn(PETSC_SUCCESS);
197: }
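/* Example (editorial sketch) of the command-line options parsed above; the values come from the string
   arrays MatCUSPARSEStorageFormats[], MatCUSPARSESpMVAlgorithms[], etc. defined at the top of this file:

     -mat_cusparse_storage_format hyb         (SpMV and TriSolve storage format)
     -mat_cusparse_mult_storage_format ell    (SpMV-only storage format)
     -mat_cusparse_use_cpu_solve              (use the CPU (I)LU solve)
     -mat_cusparse_spmv_alg csrmv_alg1        (cuSPARSE SpMV algorithm, CUDA >= 11.0) */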
199: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
200: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
201: {
202: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
203: PetscInt m = A->rmap->n;
204: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
205: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
206: const MatScalar *Aa = a->a;
207: PetscInt *Mi, *Mj, Mnz;
208: PetscScalar *Ma;
210: PetscFunctionBegin;
211: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
212: if (!fs->csrRowPtr) { // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
213: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
214: Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
215: PetscCall(PetscMalloc1(m + 1, &Mi));
216: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
217: PetscCall(PetscMalloc1(Mnz, &Ma));
218: Mi[0] = 0;
219: for (PetscInt i = 0; i < m; i++) {
220: PetscInt llen = Ai[i + 1] - Ai[i];
221: PetscInt ulen = Adiag[i] - Adiag[i + 1];
222: PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
223: Mj[Mi[i] + llen] = i; // diagonal entry
224: PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
225: Mi[i + 1] = Mi[i] + llen + ulen;
226: }
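      /* Illustration (editorial, not part of the original source): each row i of M is laid out as
             [ L(i,0:i-1) | diagonal slot | U(i,i+1:m-1) ]
         i.e. llen strictly-lower entries taken from Ai/Aj, one diagonal entry, and ulen-1 strictly-upper
         entries taken from Adiag/Aj, giving the row length llen + ulen recorded in Mi[i+1]. */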
227: // Copy M (L,U) from host to device
228: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
229: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
230: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
231: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
232: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));
234: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
235: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
236: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
237: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
238: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
239: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER;
240: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT;
241: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
243: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
244: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
245: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
247: fillMode = CUSPARSE_FILL_MODE_UPPER;
248: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
249: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
250: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
251: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
253: // Allocate work vectors in SpSv
254: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
255: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
257: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
258: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
260: // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
261: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
262: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
263: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
264: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
265: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
266: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
268: // Record for reuse
269: fs->csrRowPtr_h = Mi;
270: fs->csrVal_h = Ma;
271: PetscCall(PetscFree(Mj));
272: }
273: // Copy the value
274: Mi = fs->csrRowPtr_h;
275: Ma = fs->csrVal_h;
276: Mnz = Mi[m];
277: for (PetscInt i = 0; i < m; i++) {
278: PetscInt llen = Ai[i + 1] - Ai[i];
279: PetscInt ulen = Adiag[i] - Adiag[i + 1];
280: PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
281: Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry
282: PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
283: }
284: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
286: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
287: if (fs->updatedSpSVAnalysis) { // cusparseSpSV_analysis() has been done before and only the matrix values changed?
288: // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
289: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
290: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
291: } else
292: #endif
293: {
294: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
295: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
297: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
298: fs->updatedSpSVAnalysis = PETSC_TRUE;
299: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
300: }
301: }
302: PetscFunctionReturn(PETSC_SUCCESS);
303: }
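/* Sketch (editorial) of the cusparseSpSV life cycle used above for each triangular factor:

     cusparseSpSV_createDescr(&descr);
     cusparseSpSV_bufferSize(handle, op, &one, spMat, x, y, type, CUSPARSE_SPSV_ALG_DEFAULT, descr, &size);
     cudaMalloc(&buffer, size);
     cusparseSpSV_analysis(handle, op, &one, spMat, x, y, type, CUSPARSE_SPSV_ALG_DEFAULT, descr, buffer); // numeric; needs valid values
     cusparseSpSV_solve(handle, op, &one, spMat, x, y, type, CUSPARSE_SPSV_ALG_DEFAULT, descr);            // repeated in each MatSolve()

   With CUDA >= 12.1.1 a pure change of matrix values can instead be pushed with cusparseSpSV_updateMatrix(). */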
304: #else
305: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
306: {
307: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
308: PetscInt n = A->rmap->n;
309: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
310: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
311: const PetscInt *ai = a->i, *aj = a->j, *vi;
312: const MatScalar *aa = a->a, *v;
313: PetscInt *AiLo, *AjLo;
314: PetscInt i, nz, nzLower, offset, rowOffset;
316: PetscFunctionBegin;
317: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
318: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
319: try {
320: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
321: nzLower = n + ai[n] - ai[1];
322: if (!loTriFactor) {
323: PetscScalar *AALo;
325: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
327: /* Allocate Space for the lower triangular matrix */
328: PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
329: PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
331: /* Fill the lower triangular matrix */
332: AiLo[0] = (PetscInt)0;
333: AiLo[n] = nzLower;
334: AjLo[0] = (PetscInt)0;
335: AALo[0] = (MatScalar)1.0;
336: v = aa;
337: vi = aj;
338: offset = 1;
339: rowOffset = 1;
340: for (i = 1; i < n; i++) {
341: nz = ai[i + 1] - ai[i];
342: /* additional 1 for the term on the diagonal */
343: AiLo[i] = rowOffset;
344: rowOffset += nz + 1;
346: PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
347: PetscCall(PetscArraycpy(&AALo[offset], v, nz));
349: offset += nz;
350: AjLo[offset] = (PetscInt)i;
351: AALo[offset] = (MatScalar)1.0;
352: offset += 1;
354: v += nz;
355: vi += nz;
356: }
358: /* allocate space for the triangular factor information */
359: PetscCall(PetscNew(&loTriFactor));
360: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
361: /* Create the matrix description */
362: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
363: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
364: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
365: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
366: #else
367: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
368: #endif
369: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
370: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
372: /* set the operation */
373: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
375: /* set the matrix */
376: loTriFactor->csrMat = new CsrMatrix;
377: loTriFactor->csrMat->num_rows = n;
378: loTriFactor->csrMat->num_cols = n;
379: loTriFactor->csrMat->num_entries = nzLower;
381: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
382: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
384: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
385: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
387: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
388: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
390: /* Create the solve analysis information */
391: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
392: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
393: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
394: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
395: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
396: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
397: #endif
399: /* perform the solve analysis */
400: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
401: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
402: PetscCallCUDA(WaitForCUDA());
403: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
405: /* assign the pointer */
406: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
407: loTriFactor->AA_h = AALo;
408: PetscCallCUDA(cudaFreeHost(AiLo));
409: PetscCallCUDA(cudaFreeHost(AjLo));
410: PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
411: } else { /* update values only */
412: if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
413: /* Fill the lower triangular matrix */
414: loTriFactor->AA_h[0] = 1.0;
415: v = aa;
416: vi = aj;
417: offset = 1;
418: for (i = 1; i < n; i++) {
419: nz = ai[i + 1] - ai[i];
420: PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
421: offset += nz;
422: loTriFactor->AA_h[offset] = 1.0;
423: offset += 1;
424: v += nz;
425: }
426: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
427: PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
428: }
429: } catch (char *ex) {
430: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
431: }
432: }
433: PetscFunctionReturn(PETSC_SUCCESS);
434: }
436: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
437: {
438: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
439: PetscInt n = A->rmap->n;
440: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
441: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
442: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
443: const MatScalar *aa = a->a, *v;
444: PetscInt *AiUp, *AjUp;
445: PetscInt i, nz, nzUpper, offset;
447: PetscFunctionBegin;
448: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
449: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
450: try {
451: /* next, figure out the number of nonzeros in the upper triangular matrix. */
452: nzUpper = adiag[0] - adiag[n];
453: if (!upTriFactor) {
454: PetscScalar *AAUp;
456: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
458: /* Allocate Space for the upper triangular matrix */
459: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
460: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
462: /* Fill the upper triangular matrix */
463: AiUp[0] = (PetscInt)0;
464: AiUp[n] = nzUpper;
465: offset = nzUpper;
466: for (i = n - 1; i >= 0; i--) {
467: v = aa + adiag[i + 1] + 1;
468: vi = aj + adiag[i + 1] + 1;
470: /* number of elements NOT on the diagonal */
471: nz = adiag[i] - adiag[i + 1] - 1;
473: /* decrement the offset */
474: offset -= (nz + 1);
476: /* first, set the diagonal elements */
477: AjUp[offset] = (PetscInt)i;
478: AAUp[offset] = (MatScalar)1. / v[nz];
479: AiUp[i] = AiUp[i + 1] - (nz + 1);
481: PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
482: PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
483: }
485: /* allocate space for the triangular factor information */
486: PetscCall(PetscNew(&upTriFactor));
487: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
489: /* Create the matrix description */
490: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
491: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
492: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
493: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
494: #else
495: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
496: #endif
497: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
498: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
500: /* set the operation */
501: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
503: /* set the matrix */
504: upTriFactor->csrMat = new CsrMatrix;
505: upTriFactor->csrMat->num_rows = n;
506: upTriFactor->csrMat->num_cols = n;
507: upTriFactor->csrMat->num_entries = nzUpper;
509: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
510: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
512: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
513: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
515: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
516: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
518: /* Create the solve analysis information */
519: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
520: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
521: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
522: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
523: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
524: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
525: #endif
527: /* perform the solve analysis */
528: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
529: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
531: PetscCallCUDA(WaitForCUDA());
532: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
534: /* assign the pointer */
535: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
536: upTriFactor->AA_h = AAUp;
537: PetscCallCUDA(cudaFreeHost(AiUp));
538: PetscCallCUDA(cudaFreeHost(AjUp));
539: PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
540: } else {
541: if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
542: /* Fill the upper triangular matrix */
543: offset = nzUpper;
544: for (i = n - 1; i >= 0; i--) {
545: v = aa + adiag[i + 1] + 1;
547: /* number of elements NOT on the diagonal */
548: nz = adiag[i] - adiag[i + 1] - 1;
550: /* decrement the offset */
551: offset -= (nz + 1);
553: /* first, set the diagonal elements */
554: upTriFactor->AA_h[offset] = 1. / v[nz];
555: PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
556: }
557: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
558: PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
559: }
560: } catch (char *ex) {
561: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
562: }
563: }
564: PetscFunctionReturn(PETSC_SUCCESS);
565: }
566: #endif
568: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
569: {
570: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
571: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
572: IS isrow = a->row, isicol = a->icol;
573: PetscBool row_identity, col_identity;
574: PetscInt n = A->rmap->n;
576: PetscFunctionBegin;
577: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
578: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
579: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
580: #else
581: PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
582: PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
583: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
584: #endif
586: cusparseTriFactors->nnz = a->nz;
588: A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
589: /* lower triangular indices */
590: PetscCall(ISIdentity(isrow, &row_identity));
591: if (!row_identity && !cusparseTriFactors->rpermIndices) {
592: const PetscInt *r;
594: PetscCall(ISGetIndices(isrow, &r));
595: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
596: cusparseTriFactors->rpermIndices->assign(r, r + n);
597: PetscCall(ISRestoreIndices(isrow, &r));
598: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
599: }
601: /* upper triangular indices */
602: PetscCall(ISIdentity(isicol, &col_identity));
603: if (!col_identity && !cusparseTriFactors->cpermIndices) {
604: const PetscInt *c;
606: PetscCall(ISGetIndices(isicol, &c));
607: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
608: cusparseTriFactors->cpermIndices->assign(c, c + n);
609: PetscCall(ISRestoreIndices(isicol, &c));
610: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
611: }
612: PetscFunctionReturn(PETSC_SUCCESS);
613: }
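/* Note (editorial): rpermIndices/cpermIndices are only created when the factorization ordering is not
   the identity (e.g. with -pc_factor_mat_ordering_type rcm); the GPU MatSolve() then permutes the
   right-hand side and solution with thrust permutation iterators instead of reordering the factors. */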
615: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
616: static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
617: {
618: Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data);
619: PetscInt m = A->rmap->n;
620: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
621: const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
622: const MatScalar *Aa = a->a;
623: PetscInt *Mj, Mnz;
624: PetscScalar *Ma, *D;
626: PetscFunctionBegin;
627: if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
628: if (!fs->csrRowPtr) { // Is it the first time doing the setup? Use csrRowPtr since it is not null even when m=0
629: // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
630: // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
631: Mnz = Ai[m]; // Unz (with the unit diagonal)
632: PetscCall(PetscMalloc1(Mnz, &Ma));
633: PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
634: PetscCall(PetscMalloc1(m, &D)); // the diagonal
635: for (PetscInt i = 0; i < m; i++) {
636: PetscInt ulen = Ai[i + 1] - Ai[i];
637: Mj[Ai[i]] = i; // diagonal entry
638: PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
639: }
640: // Copy M (U) from host to device
641: PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
642: PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
643: PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
644: PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
645: PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
646: PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
648: // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
649: // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
650: // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
651: // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
652: // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
653: cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER;
654: cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
655: const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
657: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
658: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
659: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
661: // Allocate work vectors in SpSv
662: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
663: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
665: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
666: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
668: // Query buffer sizes for SpSV and then allocate buffers
669: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
670: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
671: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
673: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
674: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
675: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
677: // Record for reuse
678: fs->csrVal_h = Ma;
679: fs->diag_h = D;
680: PetscCall(PetscFree(Mj));
681: }
682: // Copy the value
683: Ma = fs->csrVal_h;
684: D = fs->diag_h;
685: Mnz = Ai[m];
686: for (PetscInt i = 0; i < m; i++) {
687: D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal
688: Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
689: for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
690: }
691: PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
692: PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
694: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
695: if (fs->updatedSpSVAnalysis) {
696: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
697: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
698: } else
699: #endif
700: {
701: // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
702: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
703: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
704: fs->updatedSpSVAnalysis = PETSC_TRUE;
705: }
706: }
707: PetscFunctionReturn(PETSC_SUCCESS);
708: }
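/* Note (editorial): only the unit-diagonal U and the (already inverted) diagonal D are stored on the
   device; the Ut solve reuses spMatDescr_U with CUSPARSE_OPERATION_TRANSPOSE through its own descriptor
   and buffer (spsvDescr_Ut, spsvBuffer_Ut), so no explicit transpose of U is ever formed. */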
710: // Solve Ut D U x = b
711: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
712: {
713: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
714: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
715: const PetscScalar *barray;
716: PetscScalar *xarray;
717: thrust::device_ptr<const PetscScalar> bGPU;
718: thrust::device_ptr<PetscScalar> xGPU;
719: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
720: PetscInt m = A->rmap->n;
722: PetscFunctionBegin;
723: PetscCall(PetscLogGpuTimeBegin());
724: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
725: PetscCall(VecCUDAGetArrayRead(b, &barray));
726: xGPU = thrust::device_pointer_cast(xarray);
727: bGPU = thrust::device_pointer_cast(barray);
729: // Reorder b with the row permutation if needed, and wrap the result in fs->X
730: if (fs->rpermIndices) {
731: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
732: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
733: } else {
734: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
735: }
737: // Solve Ut Y = X
738: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
739: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
741: // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
742: // It is basically a vector element-wise multiplication, but cublas does not have it!
743: #if CCCL_VERSION >= 3001000
744: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), cuda::std::multiplies<PetscScalar>()));
745: #else
746: PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
747: #endif
749: // Solve U X = Y
750: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
751: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
752: } else {
753: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
754: }
755: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
757: // Reorder X with the column permutation if needed, and put the result back to x
758: if (fs->cpermIndices) {
759: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
760: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
761: }
763: PetscCall(VecCUDARestoreArrayRead(b, &barray));
764: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
765: PetscCall(PetscLogGpuTimeEnd());
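  // Approximate flop count: two triangular solves with U at ~(2*nz - m) flops each, plus m multiplications by the inverted diagonal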
766: PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
767: PetscFunctionReturn(PETSC_SUCCESS);
768: }
769: #else
770: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
771: {
772: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
773: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
774: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
775: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
776: PetscInt *AiUp, *AjUp;
777: PetscScalar *AAUp;
778: PetscScalar *AALo;
779: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
780: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
781: const PetscInt *ai = b->i, *aj = b->j, *vj;
782: const MatScalar *aa = b->a, *v;
784: PetscFunctionBegin;
785: if (!n) PetscFunctionReturn(PETSC_SUCCESS);
786: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
787: try {
788: PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
789: PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
790: if (!upTriFactor && !loTriFactor) {
791: /* Allocate Space for the upper triangular matrix */
792: PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
793: PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
795: /* Fill the upper triangular matrix */
796: AiUp[0] = (PetscInt)0;
797: AiUp[n] = nzUpper;
798: offset = 0;
799: for (i = 0; i < n; i++) {
800: /* set the pointers */
801: v = aa + ai[i];
802: vj = aj + ai[i];
803: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
805: /* first, set the diagonal elements */
806: AjUp[offset] = (PetscInt)i;
807: AAUp[offset] = (MatScalar)1.0 / v[nz];
808: AiUp[i] = offset;
809: AALo[offset] = (MatScalar)1.0 / v[nz];
811: offset += 1;
812: if (nz > 0) {
813: PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
814: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
815: for (j = offset; j < offset + nz; j++) {
816: AAUp[j] = -AAUp[j];
817: AALo[j] = AAUp[j] / v[nz];
818: }
819: offset += nz;
820: }
821: }
823: /* allocate space for the triangular factor information */
824: PetscCall(PetscNew(&upTriFactor));
825: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
827: /* Create the matrix description */
828: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
829: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
830: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
831: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
832: #else
833: PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
834: #endif
835: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
836: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
838: /* set the matrix */
839: upTriFactor->csrMat = new CsrMatrix;
840: upTriFactor->csrMat->num_rows = A->rmap->n;
841: upTriFactor->csrMat->num_cols = A->cmap->n;
842: upTriFactor->csrMat->num_entries = a->nz;
844: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
845: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
847: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
850: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
853: /* set the operation */
854: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
856: /* Create the solve analysis information */
857: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
858: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
859: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
860: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
861: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
862: PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
863: #endif
865: /* perform the solve analysis */
866: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
867: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
869: PetscCallCUDA(WaitForCUDA());
870: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
872: /* assign the pointer */
873: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
875: /* allocate space for the triangular factor information */
876: PetscCall(PetscNew(&loTriFactor));
877: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
879: /* Create the matrix description */
880: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
881: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
882: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
883: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
884: #else
885: PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
886: #endif
887: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
888: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
890: /* set the operation */
891: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
893: /* set the matrix */
894: loTriFactor->csrMat = new CsrMatrix;
895: loTriFactor->csrMat->num_rows = A->rmap->n;
896: loTriFactor->csrMat->num_cols = A->cmap->n;
897: loTriFactor->csrMat->num_entries = a->nz;
899: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
900: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
902: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
903: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
905: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
906: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
908: /* Create the solve analysis information */
909: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
910: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
911: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
912: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
913: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
914: PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
915: #endif
917: /* perform the solve analysis */
918: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
919: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
921: PetscCallCUDA(WaitForCUDA());
922: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
924: /* assign the pointer */
925: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
927: PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
928: PetscCallCUDA(cudaFreeHost(AiUp));
929: PetscCallCUDA(cudaFreeHost(AjUp));
930: } else {
931: /* Fill the upper triangular matrix */
932: offset = 0;
933: for (i = 0; i < n; i++) {
934: /* set the pointers */
935: v = aa + ai[i];
936: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
938: /* first, set the diagonal elements */
939: AAUp[offset] = 1.0 / v[nz];
940: AALo[offset] = 1.0 / v[nz];
942: offset += 1;
943: if (nz > 0) {
944: PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
945: for (j = offset; j < offset + nz; j++) {
946: AAUp[j] = -AAUp[j];
947: AALo[j] = AAUp[j] / v[nz];
948: }
949: offset += nz;
950: }
951: }
952: PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
953: PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
954: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
955: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
956: PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
957: }
958: PetscCallCUDA(cudaFreeHost(AAUp));
959: PetscCallCUDA(cudaFreeHost(AALo));
960: } catch (char *ex) {
961: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
962: }
963: }
964: PetscFunctionReturn(PETSC_SUCCESS);
965: }
966: #endif
968: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
969: {
970: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
971: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
972: IS ip = a->row;
973: PetscBool perm_identity;
974: PetscInt n = A->rmap->n;
976: PetscFunctionBegin;
977: PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
979: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
980: PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A));
981: #else
982: PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
983: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
984: #endif
985: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
987: A->offloadmask = PETSC_OFFLOAD_BOTH;
989: /* lower triangular indices */
990: PetscCall(ISIdentity(ip, &perm_identity));
991: if (!perm_identity) {
992: IS iip;
993: const PetscInt *irip, *rip;
995: PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
996: PetscCall(ISGetIndices(iip, &irip));
997: PetscCall(ISGetIndices(ip, &rip));
998: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
999: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1000: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
1001: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
1002: PetscCall(ISRestoreIndices(iip, &irip));
1003: PetscCall(ISDestroy(&iip));
1004: PetscCall(ISRestoreIndices(ip, &rip));
1005: PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1006: }
1007: PetscFunctionReturn(PETSC_SUCCESS);
1008: }
1010: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1011: {
1012: PetscFunctionBegin;
1013: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
1014: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1015: B->offloadmask = PETSC_OFFLOAD_CPU;
1017: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1018: B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky;
1019: B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1020: #else
1021: /* determine which version of MatSolve needs to be used. */
1022: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1023: IS ip = b->row;
1024: PetscBool perm_identity;
1026: PetscCall(ISIdentity(ip, &perm_identity));
1027: if (perm_identity) {
1028: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1029: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1030: } else {
1031: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
1032: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1033: }
1034: #endif
1035: B->ops->matsolve = NULL;
1036: B->ops->matsolvetranspose = NULL;
1038: /* get the triangular factors */
1039: PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
1040: PetscFunctionReturn(PETSC_SUCCESS);
1041: }
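/* Example (editorial sketch): this numeric factorization is usually reached through a preconditioner, e.g.

     -mat_type aijcusparse -pc_type icc
     -mat_type aijcusparse -pc_type cholesky

   The symbolic and numeric factorizations run on the CPU; the factors are then copied to the GPU by
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() for the triangular solves. */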
1043: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1044: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1045: {
1046: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1047: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1048: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1049: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1050: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1051: cusparseIndexBase_t indexBase;
1052: cusparseMatrixType_t matrixType;
1053: cusparseFillMode_t fillMode;
1054: cusparseDiagType_t diagType;
1056: PetscFunctionBegin;
1057: /* allocate space for the transpose of the lower triangular factor */
1058: PetscCall(PetscNew(&loTriFactorT));
1059: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1061: /* set the matrix descriptors of the lower triangular factor */
1062: matrixType = cusparseGetMatType(loTriFactor->descr);
1063: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1064: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1065: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1067: /* Create the matrix description */
1068: PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1069: PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1070: PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1071: PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1072: PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1074: /* set the operation */
1075: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1077: /* allocate GPU space for the CSC of the lower triangular factor*/
1078: loTriFactorT->csrMat = new CsrMatrix;
1079: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1080: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1081: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1082: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1083: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1084: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1086: /* compute the transpose of the lower triangular factor, i.e. the CSC */
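  /* Editorial note: the CSC arrays of a matrix are, column by column, exactly the CSR arrays of its
     transpose, so the csr2csc conversion below yields the transposed factor needed by MatSolveTranspose()
     without assembling it on the host. */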
1087: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1088: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
1089: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1090: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1091: PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1092: #endif
1094: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1095: {
1096: // there is no clean way to wrap this call directly in PetscCallCUSPARSE because of the #if inside its argument list...
1097: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
1098: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1099: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1100: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1101: #else
1102: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1103: #endif
1104: PetscCallCUSPARSE(stat);
1105: }
1107: PetscCallCUDA(WaitForCUDA());
1108: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1110: /* Create the solve analysis information */
1111: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1112: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
1113: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1114: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1115: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
1116: PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1117: #endif
1119: /* perform the solve analysis */
1120: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1121: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1123: PetscCallCUDA(WaitForCUDA());
1124: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1126: /* assign the pointer */
1127: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1129: /*********************************************/
1130: /* Now the Transpose of the Upper Tri Factor */
1131: /*********************************************/
1133: /* allocate space for the transpose of the upper triangular factor */
1134: PetscCall(PetscNew(&upTriFactorT));
1135: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1137: /* set the matrix descriptors of the upper triangular factor */
1138: matrixType = cusparseGetMatType(upTriFactor->descr);
1139: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1140: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1141: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1143: /* Create the matrix description */
1144: PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1145: PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1146: PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1147: PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1148: PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1150: /* set the operation */
1151: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1153: /* allocate GPU space for the CSC of the upper triangular factor*/
1154: upTriFactorT->csrMat = new CsrMatrix;
1155: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1156: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1157: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1158: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1159: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1160: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1162: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1163: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1164: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
1165: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1166: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1167: PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1168: #endif
1170: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1171: {
1172: // there is no clean way to wrap this call directly in PetscCallCUSPARSE because of the #if inside its argument list...
1173: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
1174: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1175: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1176: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1177: #else
1178: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1179: #endif
1180: PetscCallCUSPARSE(stat);
1181: }
1183: PetscCallCUDA(WaitForCUDA());
1184: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1186: /* Create the solve analysis information */
1187: PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1188: PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1189: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1190: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1191: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
1192: PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1193: #endif
1195: /* perform the solve analysis */
1196: /* TODO: this duplicated lower/upper analysis setup really belongs in a helper function */
1197: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1198: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1200: PetscCallCUDA(WaitForCUDA());
1201: PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1203: /* assign the pointer */
1204: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1205: PetscFunctionReturn(PETSC_SUCCESS);
1206: }
1207: #endif
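/* For reference: both conversions above follow the usual two-phase cuSPARSE pattern of querying a
   workspace size, allocating it, and then running the conversion. A minimal sketch with hypothetical
   device arrays val/rowPtr/colIdx (n x n CSR, nnz entries) and cscVal/cscColPtr/cscRowIdx for the
   result (CUDA >= 11, double precision assumed):

     size_t bufSize;
     void  *buf;
     PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(handle, n, n, nnz, val, rowPtr, colIdx, cscVal, cscColPtr, cscRowIdx,
                                                     CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &bufSize));
     PetscCallCUDA(cudaMalloc(&buf, bufSize));
     PetscCallCUSPARSE(cusparseCsr2cscEx2(handle, n, n, nnz, val, rowPtr, colIdx, cscVal, cscColPtr, cscRowIdx,
                                          CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, buf));
     PetscCallCUDA(cudaFree(buf));
*/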
1209: struct PetscScalarToPetscInt {
1210: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1211: };
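/* A minimal illustration of how this functor is used further below: given a hypothetical device vector
   `vals` of PetscScalar and an equally long device vector `perm` of PetscInt,

     thrust::transform(thrust::device, vals.begin(), vals.end(), perm.begin(), PetscScalarToPetscInt());

   truncates (the real part of) each scalar to a PetscInt. This turns the scalar "identity sequence"
   0,1,2,... that is pushed through csr2csc into an integer permutation array (csr2csc_i). */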
1213: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1214: {
1215: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1216: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1217: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1218: cusparseStatus_t stat;
1219: cusparseIndexBase_t indexBase;
1221: PetscFunctionBegin;
1222: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1223: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
1224: PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1225: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
1226: PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
1227: if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
1228: PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1229: PetscCall(PetscLogGpuTimeBegin());
1230: if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1231: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1232: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1233: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1234: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1235: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1236: PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1238: /* set alpha and beta */
1239: PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1240: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1241: PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
1242: PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1243: PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1244: PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1246: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1247: CsrMatrix *matrixT = new CsrMatrix;
1248: matstructT->mat = matrixT;
1249: matrixT->num_rows = A->cmap->n;
1250: matrixT->num_cols = A->rmap->n;
1251: matrixT->num_entries = a->nz;
1252: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1253: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1254: matrixT->values = new THRUSTARRAY(a->nz);
1256: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1257: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1259: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1260: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1261: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1262: indexBase, cusparse_scalartype);
1263: PetscCallCUSPARSE(stat);
1264: #else
1265: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1266: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1268: It is not clear what a proper value for matstructT->matDescr would be with empty matrices, so I just set
1269: it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1270: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1271: */
1272: if (matrixT->num_entries) {
1273: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1274: PetscCallCUSPARSE(stat);
1276: } else {
1277: matstructT->matDescr = NULL;
1278: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1279: }
1280: #endif
1281: #endif
1282: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1283: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1284: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1285: #else
1286: CsrMatrix *temp = new CsrMatrix;
1287: CsrMatrix *tempT = new CsrMatrix;
1288: /* First convert HYB to CSR */
1289: temp->num_rows = A->rmap->n;
1290: temp->num_cols = A->cmap->n;
1291: temp->num_entries = a->nz;
1292: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1293: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1294: temp->values = new THRUSTARRAY(a->nz);
1296: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1297: PetscCallCUSPARSE(stat);
1299: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1300: tempT->num_rows = A->rmap->n;
1301: tempT->num_cols = A->cmap->n;
1302: tempT->num_entries = a->nz;
1303: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1304: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1305: tempT->values = new THRUSTARRAY(a->nz);
1307: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1308: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1309: PetscCallCUSPARSE(stat);
1311: /* Last, convert CSC to HYB */
1312: cusparseHybMat_t hybMat;
1313: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1314: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1315: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1316: PetscCallCUSPARSE(stat);
1318: /* assign the pointer */
1319: matstructT->mat = hybMat;
1320: A->transupdated = PETSC_TRUE;
1321: /* delete temporaries */
1322: if (tempT) {
1323: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1324: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1325: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1326: delete (CsrMatrix *)tempT;
1327: }
1328: if (temp) {
1329: if (temp->values) delete (THRUSTARRAY *)temp->values;
1330: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1331: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1332: delete (CsrMatrix *)temp;
1333: }
1334: #endif
1335: }
1336: }
1337: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1338: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1339: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1340: PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
1341: PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
1342: PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
1343: PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
1344: PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
1345: PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
1346: PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
1347: PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1348: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1349: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1350: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1351: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1352: }
1353: if (!cusparsestruct->csr2csc_i) {
1354: THRUSTARRAY csr2csc_a(matrix->num_entries);
1355: PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1357: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1358: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1359: void *csr2cscBuffer;
1360: size_t csr2cscBufferSize;
1361: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1362: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1363: PetscCallCUSPARSE(stat);
1364: PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1365: #endif
1367: if (matrix->num_entries) {
1368: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1369: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1370: I checked every parameter and they were all fine. I have no clue why cusparse complains.
1372: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1373: should be filled with indexBase. So I just take a shortcut here.
1374: */
1375: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1376: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1377: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1378: PetscCallCUSPARSE(stat);
1379: #else
1380: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1381: PetscCallCUSPARSE(stat);
1382: #endif
1383: } else {
1384: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1385: }
1387: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1388: PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1389: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1390: PetscCallCUDA(cudaFree(csr2cscBuffer));
1391: #endif
1392: }
1393: PetscCallThrust(
1394: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1395: }
1396: PetscCall(PetscLogGpuTimeEnd());
1397: PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1398: /* the compressed row indices are not used for matTranspose */
1399: matstructT->cprowIndices = NULL;
1400: /* assign the pointer */
1401: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1402: A->transupdated = PETSC_TRUE;
1403: PetscFunctionReturn(PETSC_SUCCESS);
1404: }
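/* Once csr2csc_i is available, refreshing the transpose after a change of matrix values reduces to the
   gather at the end of the routine above; no further csr2csc call is needed. A standalone sketch with
   hypothetical thrust::device_vector<PetscScalar> csrVals, cscVals and thrust::device_vector<PetscInt> perm,
   where perm[k] is the CSR position of the k-th CSC entry:

     thrust::copy(thrust::device,
                  thrust::make_permutation_iterator(csrVals.begin(), perm.begin()),
                  thrust::make_permutation_iterator(csrVals.begin(), perm.end()),
                  cscVals.begin());
*/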
1406: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1407: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1408: {
1409: const PetscScalar *barray;
1410: PetscScalar *xarray;
1411: thrust::device_ptr<const PetscScalar> bGPU;
1412: thrust::device_ptr<PetscScalar> xGPU;
1413: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1414: const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1415: const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
1416: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1417: PetscInt m = A->rmap->n;
1419: PetscFunctionBegin;
1420: PetscCall(PetscLogGpuTimeBegin());
1421: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1422: PetscCall(VecCUDAGetArrayRead(b, &barray));
1423: xGPU = thrust::device_pointer_cast(xarray);
1424: bGPU = thrust::device_pointer_cast(barray);
1426: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1427: if (fs->rpermIndices) {
1428: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1429: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1430: } else {
1431: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1432: }
1434: // Solve L Y = X
1435: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1436: // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1437: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1439: // Solve U X = Y
1440: if (fs->cpermIndices) {
1441: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1442: } else {
1443: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1444: }
1445: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1447: // Reorder X with the column permutation if needed, and put the result back to x
1448: if (fs->cpermIndices) {
1449: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1450: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1451: }
1452: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1453: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1454: PetscCall(PetscLogGpuTimeEnd());
1455: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1456: PetscFunctionReturn(PETSC_SUCCESS);
1457: }
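/* The SpSV calls above rely on state set up during factorization. A minimal sketch of the full
   cusparseSpSV life cycle for one triangular factor, with hypothetical descriptors L (sparse matrix)
   and x, y (dense vectors), CUDA >= 11.4:

     cusparseSpSVDescr_t spsv;
     size_t              bufSize;
     void               *buf;
     PetscCallCUSPARSE(cusparseSpSV_createDescr(&spsv));
     PetscCallCUSPARSE(cusparseSpSV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, L, x, y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, spsv, &bufSize));
     PetscCallCUDA(cudaMalloc(&buf, bufSize));
     PetscCallCUSPARSE(cusparseSpSV_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, L, x, y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, spsv, buf));
     PetscCallCUSPARSE(cusparseSpSV_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, L, x, y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, spsv));

   Note that `buf` must stay allocated for the lifetime of the solves: cusparseSpSV_solve() keeps using the
   buffer passed to cusparseSpSV_analysis(), which is why the factorization routines keep spsvBuffer_L/U around. */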
1459: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1460: {
1461: Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1462: Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data);
1463: const PetscScalar *barray;
1464: PetscScalar *xarray;
1465: thrust::device_ptr<const PetscScalar> bGPU;
1466: thrust::device_ptr<PetscScalar> xGPU;
1467: const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE;
1468: const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT;
1469: PetscInt m = A->rmap->n;
1471: PetscFunctionBegin;
1472: PetscCall(PetscLogGpuTimeBegin());
1473: if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1474: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1475: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1476: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1478: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1479: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1480: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1481: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1482: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1483: }
1485: if (!fs->updatedTransposeSpSVAnalysis) {
1486: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1488: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1489: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1490: }
1492: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1493: PetscCall(VecCUDAGetArrayRead(b, &barray));
1494: xGPU = thrust::device_pointer_cast(xarray);
1495: bGPU = thrust::device_pointer_cast(barray);
1497: // Reorder b with the row permutation if needed, and wrap the result in fs->X
1498: if (fs->rpermIndices) {
1499: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1500: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1501: } else {
1502: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1503: }
1505: // Solve Ut Y = X
1506: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1507: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1509: // Solve Lt X = Y
1510: if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1511: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1512: } else {
1513: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1514: }
1515: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1517: // Reorder X with the column permutation if needed, and put the result back to x
1518: if (fs->cpermIndices) {
1519: PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1520: thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1521: }
1523: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1524: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1525: PetscCall(PetscLogGpuTimeEnd());
1526: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1527: PetscFunctionReturn(PETSC_SUCCESS);
1528: }
1529: #else
1530: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1531: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1532: {
1533: PetscInt n = xx->map->n;
1534: const PetscScalar *barray;
1535: PetscScalar *xarray;
1536: thrust::device_ptr<const PetscScalar> bGPU;
1537: thrust::device_ptr<PetscScalar> xGPU;
1538: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1539: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1540: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1541: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1543: PetscFunctionBegin;
1544: /* Analyze the matrix and create the transpose ... on the fly */
1545: if (!loTriFactorT && !upTriFactorT) {
1546: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1547: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1548: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1549: }
1551: /* Get the GPU pointers */
1552: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1553: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1554: xGPU = thrust::device_pointer_cast(xarray);
1555: bGPU = thrust::device_pointer_cast(barray);
1557: PetscCall(PetscLogGpuTimeBegin());
1558: /* First, reorder with the row permutation */
1559: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1561: /* Next, solve U */
1562: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1563: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1565: /* Then, solve L */
1566: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1567: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1569: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1570: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1572: /* Copy the temporary to the full solution. */
1573: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1575: /* restore */
1576: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1577: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1578: PetscCall(PetscLogGpuTimeEnd());
1579: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1580: PetscFunctionReturn(PETSC_SUCCESS);
1581: }
1583: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1584: {
1585: const PetscScalar *barray;
1586: PetscScalar *xarray;
1587: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1588: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1589: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1590: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1592: PetscFunctionBegin;
1593: /* Analyze the matrix and create the transpose ... on the fly */
1594: if (!loTriFactorT && !upTriFactorT) {
1595: PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1596: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1597: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1598: }
1600: /* Get the GPU pointers */
1601: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1602: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1604: PetscCall(PetscLogGpuTimeBegin());
1605: /* First, solve U */
1606: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1607: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1609: /* Then, solve L */
1610: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1611: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1613: /* restore */
1614: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1615: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1616: PetscCall(PetscLogGpuTimeEnd());
1617: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1618: PetscFunctionReturn(PETSC_SUCCESS);
1619: }
1621: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1622: {
1623: const PetscScalar *barray;
1624: PetscScalar *xarray;
1625: thrust::device_ptr<const PetscScalar> bGPU;
1626: thrust::device_ptr<PetscScalar> xGPU;
1627: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1628: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1629: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1630: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1632: PetscFunctionBegin;
1633: /* Get the GPU pointers */
1634: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1635: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1636: xGPU = thrust::device_pointer_cast(xarray);
1637: bGPU = thrust::device_pointer_cast(barray);
1639: PetscCall(PetscLogGpuTimeBegin());
1640: /* First, reorder with the row permutation */
1641: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1643: /* Next, solve L */
1644: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1645: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1647: /* Then, solve U */
1648: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1649: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1651: /* Last, reorder with the column permutation */
1652: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1654: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1655: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1656: PetscCall(PetscLogGpuTimeEnd());
1657: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1658: PetscFunctionReturn(PETSC_SUCCESS);
1659: }
1661: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1662: {
1663: const PetscScalar *barray;
1664: PetscScalar *xarray;
1665: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1666: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1667: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1668: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1670: PetscFunctionBegin;
1671: /* Get the GPU pointers */
1672: PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
1673: PetscCall(VecCUDAGetArrayRead(bb, &barray));
1675: PetscCall(PetscLogGpuTimeBegin());
1676: /* First, solve L */
1677: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1678: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1680: /* Next, solve U */
1681: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1682: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1684: PetscCall(VecCUDARestoreArrayRead(bb, &barray));
1685: PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
1686: PetscCall(PetscLogGpuTimeEnd());
1687: PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1688: PetscFunctionReturn(PETSC_SUCCESS);
1689: }
1690: #endif
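/* The pre-11.4 code path above performs its triangular solves with the legacy csrsv2 interface through the
   cusparseXcsrsv_* wrappers used in this file. A condensed sketch of the per-factor sequence, with
   hypothetical descriptor `descr`, CSR arrays val/rowPtr/colIdx, and device vectors b, x:

     csrsv2Info_t info;
     int          bufSize;
     void        *buf;
     PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&info));
     PetscCallCUSPARSE(cusparseXcsrsv_buffsize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz, descr, val, rowPtr, colIdx, info, &bufSize));
     PetscCallCUDA(cudaMalloc(&buf, bufSize));
     PetscCallCUSPARSE(cusparseXcsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz, descr, val, rowPtr, colIdx, info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buf));
     PetscCallCUSPARSE(cusparseXcsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz, &PETSC_CUSPARSE_ONE, descr, val, rowPtr, colIdx, info, b, x, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buf));
*/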
1692: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1693: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1694: {
1695: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1696: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1697: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1698: CsrMatrix *Acsr;
1699: PetscInt m, nz;
1700: PetscBool flg;
1702: PetscFunctionBegin;
1703: if (PetscDefined(USE_DEBUG)) {
1704: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1705: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1706: }
1708: /* Copy A's value to fact */
1709: m = fact->rmap->n;
1710: nz = aij->nz;
1711: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1712: Acsr = (CsrMatrix *)Acusp->mat->mat;
1713: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1715: PetscCall(PetscLogGpuTimeBegin());
1716: /* Factorize fact inplace */
1717: if (m)
1718: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1719: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1720: if (PetscDefined(USE_DEBUG)) {
1721: int numerical_zero;
1722: cusparseStatus_t status;
1723: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1724: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1725: }
1727: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1728: if (fs->updatedSpSVAnalysis) {
1729: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1730: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1731: } else
1732: #endif
1733: {
1734: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
1735: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1736: */
1737: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1739: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1741: fs->updatedSpSVAnalysis = PETSC_TRUE;
1742: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1743: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1744: }
1746: fact->offloadmask = PETSC_OFFLOAD_GPU;
1747: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1748: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1749: fact->ops->matsolve = NULL;
1750: fact->ops->matsolvetranspose = NULL;
1751: PetscCall(PetscLogGpuTimeEnd());
1752: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1753: PetscFunctionReturn(PETSC_SUCCESS);
1754: }
1756: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1757: {
1758: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1759: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1760: PetscInt m, nz;
1762: PetscFunctionBegin;
1763: if (PetscDefined(USE_DEBUG)) {
1764: PetscInt i;
1765: PetscBool flg, missing;
1767: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1768: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1769: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1770: PetscCall(MatMissingDiagonal(A, &missing, &i));
1771: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1772: }
1774: /* Free the old stale stuff */
1775: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1777: /* Copy over A's metadata to fact. Note that fact's i,j,a are also allocated on the host,
1778: but they will not be used by the factorization; they are kept only to ease debugging.
1779: */
1780: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1782: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1783: fact->factortype = MAT_FACTOR_ILU;
1784: fact->info.factor_mallocs = 0;
1785: fact->info.fill_ratio_given = info->fill;
1786: fact->info.fill_ratio_needed = 1.0;
1788: aij->row = NULL;
1789: aij->col = NULL;
1791: /* ====================================================================== */
1792: /* Copy A's i, j to fact and also allocate the value array of fact. */
1793: /* We'll do in-place factorization on fact */
1794: /* ====================================================================== */
1795: const int *Ai, *Aj;
1797: m = fact->rmap->n;
1798: nz = aij->nz;
1800: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1801: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1802: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1803: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
1804: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1805: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1807: /* ====================================================================== */
1808: /* Create descriptors for M, L, U */
1809: /* ====================================================================== */
1810: cusparseFillMode_t fillMode;
1811: cusparseDiagType_t diagType;
1813: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1814: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1815: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1817: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1818: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1819: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1820: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1821: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1822: */
1823: fillMode = CUSPARSE_FILL_MODE_LOWER;
1824: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1825: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1826: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1827: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1829: fillMode = CUSPARSE_FILL_MODE_UPPER;
1830: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1831: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
1832: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
1833: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1835: /* ========================================================================= */
1836: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1837: /* ========================================================================= */
1838: PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1839: if (m)
1840: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1841: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1843: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1844: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1846: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1847: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1849: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1850: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1852: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1853: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1855: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1856: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1857: spsvBuffer_L and spsvBuffer_U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
1858: To save memory, we let factBuffer_M share storage with the larger of spsvBuffer_L/U.
1859: */
1860: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1861: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
1862: fs->spsvBuffer_L = fs->factBuffer_M;
1863: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
1864: } else {
1865: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
1866: fs->spsvBuffer_U = fs->factBuffer_M;
1867: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
1868: }
1870: /* ========================================================================== */
1871: /* Perform analysis of ilu0 on M, SpSv on L and U */
1872: /* The lower (upper) triangular part of M has the same sparsity pattern as L (U) */
1873: /* ========================================================================== */
1874: int structural_zero;
1875: cusparseStatus_t status;
1877: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1878: if (m)
1879: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1880: fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1881: if (PetscDefined(USE_DEBUG)) {
1882: /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1883: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1884: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1885: }
1887: /* Estimate FLOPs of the numeric factorization */
1888: {
1889: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1890: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1891: PetscLogDouble flops = 0.0;
1893: PetscCall(MatMarkDiagonal_SeqAIJ(A));
1894: Ai = Aseq->i;
1895: Adiag = Aseq->diag;
1896: for (PetscInt i = 0; i < m; i++) {
1897: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1898: nzRow = Ai[i + 1] - Ai[i];
1899: nzLeft = Adiag[i] - Ai[i];
1900: /* We eliminate the nonzeros to the left of the diagonal one at a time. Each elimination updates the nonzeros
1901: at and to the right of the eliminated entry, costing one multiplication and one addition per updated entry;
1902: e.g. with nzLeft = 2 and nzRow = 5 this adds 2 * (2*5 - 2 + 1) = 18 flops for the row. */
1903: nzLeft = (nzRow - 1) / 2;
1904: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1905: }
1906: }
1907: fs->numericFactFlops = flops;
1908: }
1909: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1910: PetscFunctionReturn(PETSC_SUCCESS);
1911: }
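/* Summary of how the ILU(0) work is split between the symbolic and numeric routines above (same PETSc
   wrapper macros as used in this file; handle, descrM, val, rowPtr, colIdx, info, buf are placeholders):

     symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0), done once per sparsity pattern:
       PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&info));
       PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(handle, m, nz, descrM, val, rowPtr, colIdx, info, &bufSize));
       PetscCallCUDA(cudaMalloc(&buf, bufSize));
       PetscCallCUSPARSE(cusparseXcsrilu02_analysis(handle, m, nz, descrM, val, rowPtr, colIdx, info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buf));

     numeric phase (MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0), repeated whenever the values change:
       PetscCallCUSPARSE(cusparseXcsrilu02(handle, m, nz, descrM, val, rowPtr, colIdx, info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, buf));
       followed by cusparseSpSV_analysis() (or cusparseSpSV_updateMatrix() on CUDA >= 12.1.1) for the L/U solves
*/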
1913: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1914: {
1915: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1916: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1917: const PetscScalar *barray;
1918: PetscScalar *xarray;
1920: PetscFunctionBegin;
1921: PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1922: PetscCall(VecCUDAGetArrayRead(b, &barray));
1923: PetscCall(PetscLogGpuTimeBegin());
1925: /* Solve L*y = b */
1926: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1927: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1928: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1929: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1931: /* Solve Lt*x = y */
1932: PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1933: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1934: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1936: PetscCall(VecCUDARestoreArrayRead(b, &barray));
1937: PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1939: PetscCall(PetscLogGpuTimeEnd());
1940: PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1941: PetscFunctionReturn(PETSC_SUCCESS);
1942: }
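/* The factorization computed by csric02 is M = L*L^T with L stored in spMatDescr_L, so applying M^{-1}
   is exactly the two triangular solves above: L y = b followed by L^T x = y. Because the factored matrix
   is symmetric, the same routine also serves as the transpose solve, which is why
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() below installs it for both fact->ops->solve and
   fact->ops->solvetranspose. */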
1944: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1945: {
1946: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1947: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1948: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1949: CsrMatrix *Acsr;
1950: PetscInt m, nz;
1951: PetscBool flg;
1953: PetscFunctionBegin;
1954: if (PetscDefined(USE_DEBUG)) {
1955: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1956: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1957: }
1959: /* Copy A's value to fact */
1960: m = fact->rmap->n;
1961: nz = aij->nz;
1962: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1963: Acsr = (CsrMatrix *)Acusp->mat->mat;
1964: PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1966: /* Factorize fact inplace */
1967: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1968: csric02() only takes the lower triangular part of matrix A to perform factorization.
1969: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1970: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1971: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1972: */
1973: if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1974: if (PetscDefined(USE_DEBUG)) {
1975: int numerical_zero;
1976: cusparseStatus_t status;
1977: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1978: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1979: }
1981: #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1982: if (fs->updatedSpSVAnalysis) {
1983: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1984: if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1985: } else
1986: #endif
1987: {
1988: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1990: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1991: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1992: */
1993: PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1994: fs->updatedSpSVAnalysis = PETSC_TRUE;
1995: }
1997: fact->offloadmask = PETSC_OFFLOAD_GPU;
1998: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1999: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
2000: fact->ops->matsolve = NULL;
2001: fact->ops->matsolvetranspose = NULL;
2002: PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2003: PetscFunctionReturn(PETSC_SUCCESS);
2004: }
2006: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
2007: {
2008: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
2009: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
2010: PetscInt m, nz;
2012: PetscFunctionBegin;
2013: if (PetscDefined(USE_DEBUG)) {
2014: PetscInt i;
2015: PetscBool flg, missing;
2017: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2018: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
2019: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2020: PetscCall(MatMissingDiagonal(A, &missing, &i));
2021: PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
2022: }
2024: /* Free the old stale stuff */
2025: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2027: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
2028: though they will not be used; we allocate them only to ease debugging.
2029: */
2030: PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
2032: fact->offloadmask = PETSC_OFFLOAD_BOTH;
2033: fact->factortype = MAT_FACTOR_ICC;
2034: fact->info.factor_mallocs = 0;
2035: fact->info.fill_ratio_given = info->fill;
2036: fact->info.fill_ratio_needed = 1.0;
2038: aij->row = NULL;
2039: aij->col = NULL;
2041: /* ====================================================================== */
2042: /* Copy A's i, j to fact and also allocate the value array of fact. */
2043: /* We'll do in-place factorization on fact */
2044: /* ====================================================================== */
2045: const int *Ai, *Aj;
2047: m = fact->rmap->n;
2048: nz = aij->nz;
2050: PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
2051: PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
2052: PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
2053: PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
2054: PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2055: PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
2057: /* ====================================================================== */
2058: /* Create mat descriptors for M, L */
2059: /* ====================================================================== */
2060: cusparseFillMode_t fillMode;
2061: cusparseDiagType_t diagType;
2063: PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2064: PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2065: PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2067: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2068: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2069: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2070: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2071: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2072: */
2073: fillMode = CUSPARSE_FILL_MODE_LOWER;
2074: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2075: PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
2076: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
2077: PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
2079: /* ========================================================================= */
2080: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
2081: /* ========================================================================= */
2082: PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2083: if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));
2085: PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
2086: PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
2088: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
2089: PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
2091: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2092: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
2094: PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2095: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
2097: /* To save device memory, we let the factorization buffer share storage with the larger of the two
2098: triangular-solve buffers. See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2099: */
2100: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2101: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
2102: fs->spsvBuffer_L = fs->factBuffer_M;
2103: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
2104: } else {
2105: PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
2106: fs->spsvBuffer_Lt = fs->factBuffer_M;
2107: PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
2108: }
2110: /* ========================================================================== */
2111: /* Perform analysis of ic0 on M */
2112: /* The lower triangular part of M has the same sparsity pattern as L */
2113: /* ========================================================================== */
2114: int structural_zero;
2115: cusparseStatus_t status;
2117: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2118: if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
2119: if (PetscDefined(USE_DEBUG)) {
2120: /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2121: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2122: PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
2123: }
2125: /* Estimate FLOPs of the numeric factorization */
2126: {
2127: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
2128: PetscInt *Ai, nzRow, nzLeft;
2129: PetscLogDouble flops = 0.0;
2131: Ai = Aseq->i;
2132: for (PetscInt i = 0; i < m; i++) {
2133: nzRow = Ai[i + 1] - Ai[i];
2134: if (nzRow > 1) {
2135: /* We eliminate the nonzeros to the left of the diagonal one by one. Assume each elimination updates the
2136: nonzeros to the right of, and including, the eliminated one, and each update incurs a multiplication and an addition.
2137: */
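        /* Worked example of the estimate below (illustrative, not from the original source): a row with
           nzRow = 5 gives nzLeft = (5 - 1) / 2 = 2 and contributes 2 * (2.0 * 5 - 2 + 1) = 18 flops. */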
2138: nzLeft = (nzRow - 1) / 2;
2139: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
2140: }
2141: }
2142: fs->numericFactFlops = flops;
2143: }
2144: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2145: PetscFunctionReturn(PETSC_SUCCESS);
2146: }
2147: #endif
2149: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
2150: {
2151: // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
2152: Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2154: PetscFunctionBegin;
2155: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2156: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
2157: B->offloadmask = PETSC_OFFLOAD_CPU;
2159: if (!cusparsestruct->use_cpu_solve) {
2160: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2161: B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU;
2162: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
2163: #else
2164: /* determine which version of MatSolve needs to be used. */
2165: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
2166: IS isrow = b->row, iscol = b->col;
2167: PetscBool row_identity, col_identity;
2169: PetscCall(ISIdentity(isrow, &row_identity));
2170: PetscCall(ISIdentity(iscol, &col_identity));
2171: if (row_identity && col_identity) {
2172: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
2173: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
2174: } else {
2175: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
2176: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
2177: }
2178: #endif
2179: }
2180: B->ops->matsolve = NULL;
2181: B->ops->matsolvetranspose = NULL;
2183: /* get the triangular factors */
2184: if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
2185: PetscFunctionReturn(PETSC_SUCCESS);
2186: }
2188: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2189: {
2190: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);
2192: PetscFunctionBegin;
2193: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2194: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2195: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2196: PetscFunctionReturn(PETSC_SUCCESS);
2197: }
2199: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2200: {
2201: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2203: PetscFunctionBegin;
2204: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2205: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2206: if (!info->factoronhost) {
2207: PetscCall(ISIdentity(isrow, &row_identity));
2208: PetscCall(ISIdentity(iscol, &col_identity));
2209: }
2210: if (!info->levels && row_identity && col_identity) {
2211: PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
2212: } else
2213: #endif
2214: {
2215: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2216: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2217: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2218: }
2219: PetscFunctionReturn(PETSC_SUCCESS);
2220: }
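/* A sketch of how the device ILU(0) fast path above is typically reached (illustrative; these are standard
   PETSc options, not defined in this file): with CUDA >= 11.4 the ILU0 branch is taken when no fill levels
   are requested, both row and column orderings are identities, and factorization is not forced onto the host, e.g.

     -mat_type aijcusparse -pc_type ilu -pc_factor_levels 0 -pc_factor_mat_ordering_type natural

   Otherwise the generic host symbolic routine is used, the numeric factorization runs on the CPU
   (MatLUFactorNumeric_SeqAIJ), and only the triangular solves are offloaded to the GPU. */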
2222: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2223: {
2224: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2226: PetscFunctionBegin;
2227: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2228: PetscBool perm_identity = PETSC_FALSE;
2229: if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity));
2230: if (!info->levels && perm_identity) {
2231: PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
2232: } else
2233: #endif
2234: {
2235: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2236: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2237: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2238: }
2239: PetscFunctionReturn(PETSC_SUCCESS);
2240: }
2242: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2243: {
2244: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
2246: PetscFunctionBegin;
2247: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2248: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
2249: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2250: PetscFunctionReturn(PETSC_SUCCESS);
2251: }
2253: static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
2254: {
2255: PetscFunctionBegin;
2256: *type = MATSOLVERCUSPARSE;
2257: PetscFunctionReturn(PETSC_SUCCESS);
2258: }
2260: /*MC
2261: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2262: of type `MATSEQAIJCUSPARSE` on a single GPU. The currently supported
2263: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2264: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2265: cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2266: algorithms are not recommended. This class does NOT support direct solver operations.
2268: Level: beginner
2270: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
2271: `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2272: M*/
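/* Minimal usage sketch (illustrative; assumes a PC object obtained elsewhere, e.g. from a KSP):

     PetscCall(PCSetType(pc, PCILU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUSPARSE));

   or, equivalently, on the command line:

     -pc_type ilu -pc_factor_mat_solver_type cusparse
*/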
2274: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
2275: {
2276: PetscInt n = A->rmap->n;
2278: PetscFunctionBegin;
2279: PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2280: PetscCall(MatSetSizes(*B, n, n, n, n));
2281: (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
2282: PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
2284: if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
2285: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2286: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2287: if (!A->boundtocpu) {
2288: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2289: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2290: } else {
2291: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2292: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
2293: }
2294: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2295: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2296: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2297: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2298: if (!A->boundtocpu) {
2299: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2300: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2301: } else {
2302: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
2303: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2304: }
2305: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2306: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2307: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2309: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2310: (*B)->canuseordering = PETSC_TRUE;
2311: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2312: PetscFunctionReturn(PETSC_SUCCESS);
2313: }
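/* Sketch of driving this factor interface directly (illustrative only; A, b, x and error handling are
   assumed to exist elsewhere):

     Mat           F;
     IS            isrow, iscol;
     MatFactorInfo info;

     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatGetOrdering(A, MATORDERINGND, &isrow, &iscol));
     PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F));
     PetscCall(MatLUFactorSymbolic(F, A, isrow, iscol, &info));
     PetscCall(MatLUFactorNumeric(F, A, &info));
     PetscCall(MatSolve(F, b, x));
*/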
2315: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2316: {
2317: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2318: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2319: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2320: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
2321: #endif
2323: PetscFunctionBegin;
2324: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2325: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2326: if (A->factortype == MAT_FACTOR_NONE) {
2327: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
2328: PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2329: }
2330: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2331: else if (fs->csrVal) {
2332: /* We have a factorized matrix on device and are able to copy it to host */
2333: PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2334: }
2335: #endif
2336: else
2337: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2338: PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
2339: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2340: A->offloadmask = PETSC_OFFLOAD_BOTH;
2341: }
2342: PetscFunctionReturn(PETSC_SUCCESS);
2343: }
2345: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2346: {
2347: PetscFunctionBegin;
2348: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2349: *array = ((Mat_SeqAIJ *)A->data)->a;
2350: PetscFunctionReturn(PETSC_SUCCESS);
2351: }
2353: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2354: {
2355: PetscFunctionBegin;
2356: A->offloadmask = PETSC_OFFLOAD_CPU;
2357: *array = NULL;
2358: PetscFunctionReturn(PETSC_SUCCESS);
2359: }
2361: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2362: {
2363: PetscFunctionBegin;
2364: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2365: *array = ((Mat_SeqAIJ *)A->data)->a;
2366: PetscFunctionReturn(PETSC_SUCCESS);
2367: }
2369: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2370: {
2371: PetscFunctionBegin;
2372: *array = NULL;
2373: PetscFunctionReturn(PETSC_SUCCESS);
2374: }
2376: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2377: {
2378: PetscFunctionBegin;
2379: *array = ((Mat_SeqAIJ *)A->data)->a;
2380: PetscFunctionReturn(PETSC_SUCCESS);
2381: }
2383: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2384: {
2385: PetscFunctionBegin;
2386: A->offloadmask = PETSC_OFFLOAD_CPU;
2387: *array = NULL;
2388: PetscFunctionReturn(PETSC_SUCCESS);
2389: }
2391: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2392: {
2393: Mat_SeqAIJCUSPARSE *cusp;
2394: CsrMatrix *matrix;
2396: PetscFunctionBegin;
2397: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2398: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2399: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2400: PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
2401: matrix = (CsrMatrix *)cusp->mat->mat;
2403: if (i) {
2404: #if !defined(PETSC_USE_64BIT_INDICES)
2405: *i = matrix->row_offsets->data().get();
2406: #else
2407: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2408: #endif
2409: }
2410: if (j) {
2411: #if !defined(PETSC_USE_64BIT_INDICES)
2412: *j = matrix->column_indices->data().get();
2413: #else
2414: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
2415: #endif
2416: }
2417: if (a) *a = matrix->values->data().get();
2418: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2419: PetscFunctionReturn(PETSC_SUCCESS);
2420: }
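/* Sketch of what a caller of MatSeqAIJGetCSRAndMemType() sees for this implementation (illustrative):
   the returned i/j/a pointers address device memory and can be handed to user CUDA kernels.

     const PetscInt *ia, *ja;
     PetscScalar    *aa;
     PetscMemType    mtype;

     PetscCall(MatSeqAIJGetCSRAndMemType(A, &ia, &ja, &aa, &mtype));
     // here mtype == PETSC_MEMTYPE_CUDA and ia, ja, aa live on the GPU
*/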
2422: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2423: {
2424: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2425: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2426: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2427: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2428: cusparseStatus_t stat;
2429: PetscBool both = PETSC_TRUE;
2431: PetscFunctionBegin;
2432: PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2433: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2434: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2435: CsrMatrix *matrix;
2436: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2438: PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
2439: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2440: matrix->values->assign(a->a, a->a + a->nz);
2441: PetscCallCUDA(WaitForCUDA());
2442: PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
2443: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2444: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
2445: } else {
2446: PetscInt nnz;
2447: PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2448: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
2449: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
2450: delete cusparsestruct->workVector;
2451: delete cusparsestruct->rowoffsets_gpu;
2452: cusparsestruct->workVector = NULL;
2453: cusparsestruct->rowoffsets_gpu = NULL;
2454: try {
2455: if (a->compressedrow.use) {
2456: m = a->compressedrow.nrows;
2457: ii = a->compressedrow.i;
2458: ridx = a->compressedrow.rindex;
2459: } else {
2460: m = A->rmap->n;
2461: ii = a->i;
2462: ridx = NULL;
2463: }
2464: PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
2465: if (!a->a) {
2466: nnz = ii[m];
2467: both = PETSC_FALSE;
2468: } else nnz = a->nz;
2469: PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
2471: /* create cusparse matrix */
2472: cusparsestruct->nrows = m;
2473: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2474: PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2475: PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2476: PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2478: PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
2479: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
2480: PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
2481: PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2482: PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2483: PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2484: PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2486: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2487: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2488: /* set the matrix */
2489: CsrMatrix *mat = new CsrMatrix;
2490: mat->num_rows = m;
2491: mat->num_cols = A->cmap->n;
2492: mat->num_entries = nnz;
2493: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2494: mat->row_offsets->assign(ii, ii + m + 1);
2496: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2497: mat->column_indices->assign(a->j, a->j + nnz);
2499: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2500: if (a->a) mat->values->assign(a->a, a->a + nnz);
2502: /* assign the pointer */
2503: matstruct->mat = mat;
2504: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2505: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2506: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2507: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2508: PetscCallCUSPARSE(stat);
2509: }
2510: #endif
2511: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2512: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2513: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2514: #else
2515: CsrMatrix *mat = new CsrMatrix;
2516: mat->num_rows = m;
2517: mat->num_cols = A->cmap->n;
2518: mat->num_entries = nnz;
2519: PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
2520: mat->row_offsets->assign(ii, ii + m + 1);
2522: PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
2523: mat->column_indices->assign(a->j, a->j + nnz);
2525: PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
2526: if (a->a) mat->values->assign(a->a, a->a + nnz);
2528: cusparseHybMat_t hybMat;
2529: PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2530: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2531: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2532: PetscCallCUSPARSE(stat);
2533: /* assign the pointer */
2534: matstruct->mat = hybMat;
2536: if (mat) {
2537: if (mat->values) delete (THRUSTARRAY *)mat->values;
2538: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2539: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2540: delete (CsrMatrix *)mat;
2541: }
2542: #endif
2543: }
2545: /* assign the compressed row indices */
2546: if (a->compressedrow.use) {
2547: PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
2548: PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
2549: matstruct->cprowIndices->assign(ridx, ridx + m);
2550: tmp = m;
2551: } else {
2552: cusparsestruct->workVector = NULL;
2553: matstruct->cprowIndices = NULL;
2554: tmp = 0;
2555: }
2556: PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2558: /* assign the pointer */
2559: cusparsestruct->mat = matstruct;
2560: } catch (char *ex) {
2561: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2562: }
2563: PetscCallCUDA(WaitForCUDA());
2564: PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2565: cusparsestruct->nonzerostate = A->nonzerostate;
2566: }
2567: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2568: }
2569: PetscFunctionReturn(PETSC_SUCCESS);
2570: }
2572: struct VecCUDAPlusEquals {
2573: template <typename Tuple>
2574: __host__ __device__ void operator()(Tuple t)
2575: {
2576: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2577: }
2578: };
2580: struct VecCUDAEquals {
2581: template <typename Tuple>
2582: __host__ __device__ void operator()(Tuple t)
2583: {
2584: thrust::get<1>(t) = thrust::get<0>(t);
2585: }
2586: };
2588: struct VecCUDAEqualsReverse {
2589: template <typename Tuple>
2590: __host__ __device__ void operator()(Tuple t)
2591: {
2592: thrust::get<0>(t) = thrust::get<1>(t);
2593: }
2594: };
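/* Illustrative use of the functors above (a sketch; xarray and yarray stand for thrust device vectors of
   PetscScalar managed elsewhere). VecCUDAPlusEquals computes y += x elementwise when applied through a
   zip iterator:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(xarray->begin(), yarray->begin())),
                      thrust::make_zip_iterator(thrust::make_tuple(xarray->end(), yarray->end())),
                      VecCUDAPlusEquals());
*/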
2596: struct MatMatCusparse {
2597: PetscBool cisdense;
2598: PetscScalar *Bt;
2599: Mat X;
2600: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2601: PetscLogDouble flops;
2602: CsrMatrix *Bcsr;
2604: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2605: cusparseSpMatDescr_t matSpBDescr;
2606: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2607: cusparseDnMatDescr_t matBDescr;
2608: cusparseDnMatDescr_t matCDescr;
2609: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2610: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2611: void *dBuffer4;
2612: void *dBuffer5;
2613: #endif
2614: size_t mmBufferSize;
2615: void *mmBuffer;
2616: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2617: cusparseSpGEMMDescr_t spgemmDesc;
2618: #endif
2619: };
2621: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2622: {
2623: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2625: PetscFunctionBegin;
2626: PetscCallCUDA(cudaFree(mmdata->Bt));
2627: delete mmdata->Bcsr;
2628: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2629: if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2630: if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2631: if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2632: if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2633: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2634: if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2635: if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2636: #endif
2637: if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2638: if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2639: #endif
2640: PetscCall(MatDestroy(&mmdata->X));
2641: PetscCall(PetscFree(data));
2642: PetscFunctionReturn(PETSC_SUCCESS);
2643: }
2645: #include <../src/mat/impls/dense/seq/dense.h>
2647: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2648: {
2649: Mat_Product *product = C->product;
2650: Mat A, B;
2651: PetscInt m, n, blda, clda;
2652: PetscBool flg, biscuda;
2653: Mat_SeqAIJCUSPARSE *cusp;
2654: cusparseStatus_t stat;
2655: cusparseOperation_t opA;
2656: const PetscScalar *barray;
2657: PetscScalar *carray;
2658: MatMatCusparse *mmdata;
2659: Mat_SeqAIJCUSPARSEMultStruct *mat;
2660: CsrMatrix *csrmat;
2662: PetscFunctionBegin;
2663: MatCheckProduct(C, 1);
2664: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2665: mmdata = (MatMatCusparse *)product->data;
2666: A = product->A;
2667: B = product->B;
2668: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2669: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2670: /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2671: Instead of silently accepting a wrong answer, we prefer to raise an error */
2672: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2673: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2674: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2675: switch (product->type) {
2676: case MATPRODUCT_AB:
2677: case MATPRODUCT_PtAP:
2678: mat = cusp->mat;
2679: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2680: m = A->rmap->n;
2681: n = B->cmap->n;
2682: break;
2683: case MATPRODUCT_AtB:
2684: if (!A->form_explicit_transpose) {
2685: mat = cusp->mat;
2686: opA = CUSPARSE_OPERATION_TRANSPOSE;
2687: } else {
2688: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2689: mat = cusp->matTranspose;
2690: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2691: }
2692: m = A->cmap->n;
2693: n = B->cmap->n;
2694: break;
2695: case MATPRODUCT_ABt:
2696: case MATPRODUCT_RARt:
2697: mat = cusp->mat;
2698: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2699: m = A->rmap->n;
2700: n = B->rmap->n;
2701: break;
2702: default:
2703: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2704: }
2705: PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2706: csrmat = (CsrMatrix *)mat->mat;
2707: /* if the user passed a CPU matrix, copy the data to the GPU */
2708: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
2709: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2710: PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2712: PetscCall(MatDenseGetLDA(B, &blda));
2713: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2714: PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
2715: PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2716: } else {
2717: PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
2718: PetscCall(MatDenseGetLDA(C, &clda));
2719: }
2721: PetscCall(PetscLogGpuTimeBegin());
2722: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2723: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2724: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2725: cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2726: #else
2727: cusparseSpMatDescr_t &matADescr = mat->matDescr;
2728: #endif
2730: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2731: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2732: size_t mmBufferSize;
2733: if (mmdata->initialized && mmdata->Blda != blda) {
2734: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2735: mmdata->matBDescr = NULL;
2736: }
2737: if (!mmdata->matBDescr) {
2738: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739: mmdata->Blda = blda;
2740: }
2742: if (mmdata->initialized && mmdata->Clda != clda) {
2743: PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2744: mmdata->matCDescr = NULL;
2745: }
2746: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2747: PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2748: mmdata->Clda = clda;
2749: }
2751: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2752: if (matADescr) {
2753: PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // We found that matADescr could not be reused; it could be a cusparse bug
2754: matADescr = NULL;
2755: }
2756: #endif
2758: if (!matADescr) {
2759: stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2760: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2761: PetscCallCUSPARSE(stat);
2762: }
2764: PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2766: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2767: PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2768: PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2769: mmdata->mmBufferSize = mmBufferSize;
2770: }
2772: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
2773: PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2774: #endif
2776: mmdata->initialized = PETSC_TRUE;
2777: } else {
2778: /* to be safe, always update pointers of the mats */
2779: PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
2780: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
2781: PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2782: }
2784: /* do cusparseSpMM, which supports transpose on B */
2785: PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2786: #else
2787: PetscInt k;
2788: /* cusparseXcsrmm does not support transpose on B */
2789: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2790: cublasHandle_t cublasv2handle;
2791: cublasStatus_t cerr;
2793: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2794: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2795: PetscCallCUBLAS(cerr);
2796: blda = B->cmap->n;
2797: k = B->cmap->n;
2798: } else {
2799: k = B->rmap->n;
2800: }
2802: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2803: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2804: PetscCallCUSPARSE(stat);
2805: #endif
2806: PetscCall(PetscLogGpuTimeEnd());
2807: PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2808: PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2809: if (product->type == MATPRODUCT_RARt) {
2810: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2811: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2812: } else if (product->type == MATPRODUCT_PtAP) {
2813: PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
2814: PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2815: } else {
2816: PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2817: }
2818: if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
2819: if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2820: PetscFunctionReturn(PETSC_SUCCESS);
2821: }
2823: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2824: {
2825: Mat_Product *product = C->product;
2826: Mat A, B;
2827: PetscInt m, n;
2828: PetscBool cisdense, flg;
2829: MatMatCusparse *mmdata;
2830: Mat_SeqAIJCUSPARSE *cusp;
2832: PetscFunctionBegin;
2833: MatCheckProduct(C, 1);
2834: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2835: A = product->A;
2836: B = product->B;
2837: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2838: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2839: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2840: PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2841: switch (product->type) {
2842: case MATPRODUCT_AB:
2843: m = A->rmap->n;
2844: n = B->cmap->n;
2845: PetscCall(MatSetBlockSizesFromMats(C, A, B));
2846: break;
2847: case MATPRODUCT_AtB:
2848: m = A->cmap->n;
2849: n = B->cmap->n;
2850: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
2851: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2852: break;
2853: case MATPRODUCT_ABt:
2854: m = A->rmap->n;
2855: n = B->rmap->n;
2856: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
2857: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2858: break;
2859: case MATPRODUCT_PtAP:
2860: m = B->cmap->n;
2861: n = B->cmap->n;
2862: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
2863: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2864: break;
2865: case MATPRODUCT_RARt:
2866: m = B->rmap->n;
2867: n = B->rmap->n;
2868: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
2869: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2870: break;
2871: default:
2872: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2873: }
2874: PetscCall(MatSetSizes(C, m, n, m, n));
2875: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2876: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
2877: PetscCall(MatSetType(C, MATSEQDENSECUDA));
2879: /* product data */
2880: PetscCall(PetscNew(&mmdata));
2881: mmdata->cisdense = cisdense;
2882: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2883: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2884: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2885: #endif
2886: /* for these products we need intermediate storage */
2887: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2888: PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
2889: PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2890: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2891: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2892: } else {
2893: PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2894: }
2895: }
2896: C->product->data = mmdata;
2897: C->product->destroy = MatDestroy_MatMatCusparse;
2899: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2900: PetscFunctionReturn(PETSC_SUCCESS);
2901: }
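/* Sketch of the user-level sequence that exercises the two routines above (illustrative; A is
   MATSEQAIJCUSPARSE and B is MATSEQDENSECUDA, both created elsewhere):

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C)); // eventually reaches MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
     PetscCall(MatProductNumeric(C));  // eventually reaches MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA
*/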
2903: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2904: {
2905: Mat_Product *product = C->product;
2906: Mat A, B;
2907: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2908: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2909: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2910: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2911: PetscBool flg;
2912: cusparseStatus_t stat;
2913: MatProductType ptype;
2914: MatMatCusparse *mmdata;
2915: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2916: cusparseSpMatDescr_t BmatSpDescr;
2917: #endif
2918: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2920: PetscFunctionBegin;
2921: MatCheckProduct(C, 1);
2922: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2923: PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
2924: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2925: mmdata = (MatMatCusparse *)C->product->data;
2926: A = product->A;
2927: B = product->B;
2928: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2929: mmdata->reusesym = PETSC_FALSE;
2930: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2931: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2932: Cmat = Ccusp->mat;
2933: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2934: Ccsr = (CsrMatrix *)Cmat->mat;
2935: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2936: goto finalize;
2937: }
2938: if (!c->nz) goto finalize;
2939: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
2940: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2941: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
2942: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2943: PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2944: PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2945: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2946: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2947: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2948: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2949: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2950: PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2951: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2952: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2954: ptype = product->type;
2955: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2956: ptype = MATPRODUCT_AB;
2957: PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2958: }
2959: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2960: ptype = MATPRODUCT_AB;
2961: PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2962: }
2963: switch (ptype) {
2964: case MATPRODUCT_AB:
2965: Amat = Acusp->mat;
2966: Bmat = Bcusp->mat;
2967: break;
2968: case MATPRODUCT_AtB:
2969: Amat = Acusp->matTranspose;
2970: Bmat = Bcusp->mat;
2971: break;
2972: case MATPRODUCT_ABt:
2973: Amat = Acusp->mat;
2974: Bmat = Bcusp->matTranspose;
2975: break;
2976: default:
2977: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2978: }
2979: Cmat = Ccusp->mat;
2980: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
2981: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2982: PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2983: Acsr = (CsrMatrix *)Amat->mat;
2984: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2985: Ccsr = (CsrMatrix *)Cmat->mat;
2986: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
2987: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2988: PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2989: PetscCall(PetscLogGpuTimeBegin());
2990: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2991: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2992: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2993: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2994: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2995: PetscCallCUSPARSE(stat);
2996: #else
2997: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2998: PetscCallCUSPARSE(stat);
2999: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3000: PetscCallCUSPARSE(stat);
3001: #endif
3002: #else
3003: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3004: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3005: PetscCallCUSPARSE(stat);
3006: #endif
3007: PetscCall(PetscLogGpuFlops(mmdata->flops));
3008: PetscCallCUDA(WaitForCUDA());
3009: PetscCall(PetscLogGpuTimeEnd());
3010: C->offloadmask = PETSC_OFFLOAD_GPU;
3011: finalize:
3012: /* shorter version of MatAssemblyEnd_SeqAIJ */
3013: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
3014: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
3015: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3016: c->reallocs = 0;
3017: C->info.mallocs += 0;
3018: C->info.nz_unneeded = 0;
3019: C->assembled = C->was_assembled = PETSC_TRUE;
3020: C->num_ass++;
3021: PetscFunctionReturn(PETSC_SUCCESS);
3022: }
3024: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3025: {
3026: Mat_Product *product = C->product;
3027: Mat A, B;
3028: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
3029: Mat_SeqAIJ *a, *b, *c;
3030: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3031: CsrMatrix *Acsr, *Bcsr, *Ccsr;
3032: PetscInt i, j, m, n, k;
3033: PetscBool flg;
3034: cusparseStatus_t stat;
3035: MatProductType ptype;
3036: MatMatCusparse *mmdata;
3037: PetscLogDouble flops;
3038: PetscBool biscompressed, ciscompressed;
3039: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3040: int64_t C_num_rows1, C_num_cols1, C_nnz1;
3041: cusparseSpMatDescr_t BmatSpDescr;
3042: #else
3043: int cnz;
3044: #endif
3045: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3047: PetscFunctionBegin;
3048: MatCheckProduct(C, 1);
3049: PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3050: A = product->A;
3051: B = product->B;
3052: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
3053: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
3054: PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
3055: PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3056: a = (Mat_SeqAIJ *)A->data;
3057: b = (Mat_SeqAIJ *)B->data;
3058: /* product data */
3059: PetscCall(PetscNew(&mmdata));
3060: C->product->data = mmdata;
3061: C->product->destroy = MatDestroy_MatMatCusparse;
3063: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3064: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3065: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3066: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
3067: PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3068: PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3070: ptype = product->type;
3071: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3072: ptype = MATPRODUCT_AB;
3073: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3074: }
3075: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3076: ptype = MATPRODUCT_AB;
3077: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3078: }
3079: biscompressed = PETSC_FALSE;
3080: ciscompressed = PETSC_FALSE;
3081: switch (ptype) {
3082: case MATPRODUCT_AB:
3083: m = A->rmap->n;
3084: n = B->cmap->n;
3085: k = A->cmap->n;
3086: Amat = Acusp->mat;
3087: Bmat = Bcusp->mat;
3088: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3089: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3090: break;
3091: case MATPRODUCT_AtB:
3092: m = A->cmap->n;
3093: n = B->cmap->n;
3094: k = A->rmap->n;
3095: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3096: Amat = Acusp->matTranspose;
3097: Bmat = Bcusp->mat;
3098: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3099: break;
3100: case MATPRODUCT_ABt:
3101: m = A->rmap->n;
3102: n = B->rmap->n;
3103: k = A->cmap->n;
3104: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3105: Amat = Acusp->mat;
3106: Bmat = Bcusp->matTranspose;
3107: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3108: break;
3109: default:
3110: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3111: }
3113: /* create cusparse matrix */
3114: PetscCall(MatSetSizes(C, m, n, m, n));
3115: PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3116: c = (Mat_SeqAIJ *)C->data;
3117: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3118: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
3119: Ccsr = new CsrMatrix;
3121: c->compressedrow.use = ciscompressed;
3122: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
3123: c->compressedrow.nrows = a->compressedrow.nrows;
3124: PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
3125: PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3126: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
3127: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3128: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3129: } else {
3130: c->compressedrow.nrows = 0;
3131: c->compressedrow.i = NULL;
3132: c->compressedrow.rindex = NULL;
3133: Ccusp->workVector = NULL;
3134: Cmat->cprowIndices = NULL;
3135: }
3136: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
3137: Ccusp->mat = Cmat;
3138: Ccusp->mat->mat = Ccsr;
3139: Ccsr->num_rows = Ccusp->nrows;
3140: Ccsr->num_cols = n;
3141: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
3142: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3143: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3144: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3145: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3146: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3147: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
3148: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3150: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3151: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3152: PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3153: c->nz = 0;
3154: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3155: Ccsr->values = new THRUSTARRAY(c->nz);
3156: goto finalizesym;
3157: }
3159: PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
3160: PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3161: Acsr = (CsrMatrix *)Amat->mat;
3162: if (!biscompressed) {
3163: Bcsr = (CsrMatrix *)Bmat->mat;
3164: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3165: BmatSpDescr = Bmat->matDescr;
3166: #endif
3167: } else { /* we need to use row offsets for the full matrix */
3168: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
3169: Bcsr = new CsrMatrix;
3170: Bcsr->num_rows = B->rmap->n;
3171: Bcsr->num_cols = cBcsr->num_cols;
3172: Bcsr->num_entries = cBcsr->num_entries;
3173: Bcsr->column_indices = cBcsr->column_indices;
3174: Bcsr->values = cBcsr->values;
3175: if (!Bcusp->rowoffsets_gpu) {
3176: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3177: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
3178: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3179: }
3180: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3181: mmdata->Bcsr = Bcsr;
3182: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3183: if (Bcsr->num_rows && Bcsr->num_cols) {
3184: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3185: PetscCallCUSPARSE(stat);
3186: }
3187: BmatSpDescr = mmdata->matSpBDescr;
3188: #endif
3189: }
3190: PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
3191: PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3192: /* precompute flops count */
3193: if (ptype == MATPRODUCT_AB) {
3194: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3195: const PetscInt st = a->i[i];
3196: const PetscInt en = a->i[i + 1];
3197: for (j = st; j < en; j++) {
3198: const PetscInt brow = a->j[j];
3199: flops += 2. * (b->i[brow + 1] - b->i[brow]);
3200: }
3201: }
3202: } else if (ptype == MATPRODUCT_AtB) {
3203: for (i = 0, flops = 0; i < A->rmap->n; i++) {
3204: const PetscInt anzi = a->i[i + 1] - a->i[i];
3205: const PetscInt bnzi = b->i[i + 1] - b->i[i];
3206: flops += (2. * anzi) * bnzi;
3207: }
3208: } else { /* TODO */
3209: flops = 0.;
3210: }
3212: mmdata->flops = flops;
3213: PetscCall(PetscLogGpuTimeBegin());
3215: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3216: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3217: // cuda-12.2 requires non-null csrRowOffsets
3218: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
3219: PetscCallCUSPARSE(stat);
3220: PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3221: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3222: {
3223: /* cusparseSpGEMMreuse has a more reasonable API than cusparseSpGEMM, so we prefer to use it.
3224: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3225: */
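/* Added roadmap of the reuse path below (inferred from the calls that follow, not part of the original source): the
   symbolic phase runs cusparseSpGEMMreuse_workEstimation, _nnz and _copy, each called twice (once to query the buffer
   size, once with the allocated buffer), followed by one _compute. dBuffer1-3 are temporary and freed here, while
   dBuffer4/dBuffer5 are kept in mmdata, presumably because _compute is re-run in the numeric phase. */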
3226: void *dBuffer1 = NULL;
3227: void *dBuffer2 = NULL;
3228: void *dBuffer3 = NULL;
3229: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3230: size_t bufferSize1 = 0;
3231: size_t bufferSize2 = 0;
3232: size_t bufferSize3 = 0;
3233: size_t bufferSize4 = 0;
3234: size_t bufferSize5 = 0;
3236: /* ask bufferSize1 bytes for external memory */
3237: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
3238: PetscCallCUSPARSE(stat);
3239: PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3240: /* inspect the matrices A and B to understand the memory requirement for the next step */
3241: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
3242: PetscCallCUSPARSE(stat);
3244: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
3245: PetscCallCUSPARSE(stat);
3246: PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
3247: PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
3248: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
3249: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
3250: PetscCallCUSPARSE(stat);
3251: PetscCallCUDA(cudaFree(dBuffer1));
3252: PetscCallCUDA(cudaFree(dBuffer2));
3254: /* get matrix C non-zero entries C_nnz1 */
3255: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3256: c->nz = (PetscInt)C_nnz1;
3257: /* allocate matrix C */
3258: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3259: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260: Ccsr->values = new THRUSTARRAY(c->nz);
3261: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3262: /* update matC with the new pointers */
3263: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3264: PetscCallCUSPARSE(stat);
3266: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
3267: PetscCallCUSPARSE(stat);
3268: PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
3269: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
3270: PetscCallCUSPARSE(stat);
3271: PetscCallCUDA(cudaFree(dBuffer3));
3272: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3273: PetscCallCUSPARSE(stat);
3274: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3275: }
3276: #else
3277: size_t bufSize2;
3278: /* ask bufferSize bytes for external memory */
3279: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
3280: PetscCallCUSPARSE(stat);
3281: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3282: /* inspect the matrices A and B to understand the memory requirement for the next step */
3283: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
3284: PetscCallCUSPARSE(stat);
3285: /* ask bufferSize again bytes for external memory */
3286: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
3287: PetscCallCUSPARSE(stat);
3288: /* Neither the CUSPARSE documentation nor the API is clear here:
3289: we need both buffers for the operations to work properly!
3290: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3291: it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address
3292: is stored in the descriptor! What a messy API... */
3293: PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3294: /* compute the intermediate product of A * B */
3295: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
3296: PetscCallCUSPARSE(stat);
3297: /* get matrix C non-zero entries C_nnz1 */
3298: PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3299: c->nz = (PetscInt)C_nnz1;
3300: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
3301: mmdata->mmBufferSize / 1024));
3302: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3303: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3304: Ccsr->values = new THRUSTARRAY(c->nz);
3305: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3306: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
3307: PetscCallCUSPARSE(stat);
3308: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
3309: PetscCallCUSPARSE(stat);
3310: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3311: #else
3312: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3313: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3314: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
3315: PetscCallCUSPARSE(stat);
3316: c->nz = cnz;
3317: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3318: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319: Ccsr->values = new THRUSTARRAY(c->nz);
3320: PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3322: PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3323: /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
3324: I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
3325: D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3326: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
3327: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
3328: PetscCallCUSPARSE(stat);
3329: #endif
3330: PetscCall(PetscLogGpuFlops(mmdata->flops));
3331: PetscCall(PetscLogGpuTimeEnd());
3332: finalizesym:
3333: c->free_a = PETSC_TRUE;
3334: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
3335: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3336: c->free_ij = PETSC_TRUE;
3337: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3338: PetscInt *d_i = c->i;
3339: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3340: THRUSTINTARRAY jj(Ccsr->column_indices->size());
3341: ii = *Ccsr->row_offsets;
3342: jj = *Ccsr->column_indices;
3343: if (ciscompressed) d_i = c->compressedrow.i;
3344: PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3345: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3346: } else {
3347: PetscInt *d_i = c->i;
3348: if (ciscompressed) d_i = c->compressedrow.i;
3349: PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3350: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3351: }
3352: if (ciscompressed) { /* need to expand host row offsets */
3353: PetscInt r = 0;
3354: c->i[0] = 0;
3355: for (k = 0; k < c->compressedrow.nrows; k++) {
3356: const PetscInt next = c->compressedrow.rindex[k];
3357: const PetscInt old = c->compressedrow.i[k];
3358: for (; r < next; r++) c->i[r + 1] = old;
3359: }
3360: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3361: }
3362: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
3363: PetscCall(PetscMalloc1(m, &c->ilen));
3364: PetscCall(PetscMalloc1(m, &c->imax));
3365: c->maxnz = c->nz;
3366: c->nonzerorowcnt = 0;
3367: c->rmax = 0;
3368: for (k = 0; k < m; k++) {
3369: const PetscInt nn = c->i[k + 1] - c->i[k];
3370: c->ilen[k] = c->imax[k] = nn;
3371: c->nonzerorowcnt += (PetscInt)!!nn;
3372: c->rmax = PetscMax(c->rmax, nn);
3373: }
3374: PetscCall(MatMarkDiagonal_SeqAIJ(C));
3375: PetscCall(PetscMalloc1(c->nz, &c->a));
3376: Ccsr->num_entries = c->nz;
3378: C->nonzerostate++;
3379: PetscCall(PetscLayoutSetUp(C->rmap));
3380: PetscCall(PetscLayoutSetUp(C->cmap));
3381: Ccusp->nonzerostate = C->nonzerostate;
3382: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3383: C->preallocated = PETSC_TRUE;
3384: C->assembled = PETSC_FALSE;
3385: C->was_assembled = PETSC_FALSE;
3386: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3387: mmdata->reusesym = PETSC_TRUE;
3388: C->offloadmask = PETSC_OFFLOAD_GPU;
3389: }
3390: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3391: PetscFunctionReturn(PETSC_SUCCESS);
3392: }
3394: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3396: /* handles sparse or dense B */
3397: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3398: {
3399: Mat_Product *product = mat->product;
3400: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3402: PetscFunctionBegin;
3403: MatCheckProduct(mat, 1);
3404: PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3405: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3406: if (product->type == MATPRODUCT_ABC) {
3407: Ciscusp = PETSC_FALSE;
3408: if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3409: }
3410: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3411: PetscBool usecpu = PETSC_FALSE;
3412: switch (product->type) {
3413: case MATPRODUCT_AB:
3414: if (product->api_user) {
3415: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3416: PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3417: PetscOptionsEnd();
3418: } else {
3419: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3420: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3421: PetscOptionsEnd();
3422: }
3423: break;
3424: case MATPRODUCT_AtB:
3425: if (product->api_user) {
3426: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3427: PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3428: PetscOptionsEnd();
3429: } else {
3430: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3431: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3432: PetscOptionsEnd();
3433: }
3434: break;
3435: case MATPRODUCT_PtAP:
3436: if (product->api_user) {
3437: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3438: PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3439: PetscOptionsEnd();
3440: } else {
3441: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3442: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3443: PetscOptionsEnd();
3444: }
3445: break;
3446: case MATPRODUCT_RARt:
3447: if (product->api_user) {
3448: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3449: PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3450: PetscOptionsEnd();
3451: } else {
3452: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3453: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3454: PetscOptionsEnd();
3455: }
3456: break;
3457: case MATPRODUCT_ABC:
3458: if (product->api_user) {
3459: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3460: PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3461: PetscOptionsEnd();
3462: } else {
3463: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3464: PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3465: PetscOptionsEnd();
3466: }
3467: break;
3468: default:
3469: break;
3470: }
3471: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3472: }
3473: /* dispatch */
3474: if (isdense) {
3475: switch (product->type) {
3476: case MATPRODUCT_AB:
3477: case MATPRODUCT_AtB:
3478: case MATPRODUCT_ABt:
3479: case MATPRODUCT_PtAP:
3480: case MATPRODUCT_RARt:
3481: if (product->A->boundtocpu) {
3482: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3483: } else {
3484: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3485: }
3486: break;
3487: case MATPRODUCT_ABC:
3488: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3489: break;
3490: default:
3491: break;
3492: }
3493: } else if (Biscusp && Ciscusp) {
3494: switch (product->type) {
3495: case MATPRODUCT_AB:
3496: case MATPRODUCT_AtB:
3497: case MATPRODUCT_ABt:
3498: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3499: break;
3500: case MATPRODUCT_PtAP:
3501: case MATPRODUCT_RARt:
3502: case MATPRODUCT_ABC:
3503: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3504: break;
3505: default:
3506: break;
3507: }
3508: } else { /* fallback for AIJ */
3509: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3510: }
3511: PetscFunctionReturn(PETSC_SUCCESS);
3512: }
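/* Usage sketch (added illustration, not part of the original source): the backend options parsed above are ordinary
   PETSc options, so a run could select the CPU fallback for a sparse-sparse product with, e.g.,
     ./app -matmatmult_backend_cpu                (MatMatMult() API)
     ./app -mat_product_algorithm_backend_cpu     (MatProduct API)
   where ./app is a hypothetical executable; the option names are exactly those registered in the switch above. */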
3514: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3515: {
3516: PetscFunctionBegin;
3517: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3518: PetscFunctionReturn(PETSC_SUCCESS);
3519: }
3521: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3522: {
3523: PetscFunctionBegin;
3524: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3525: PetscFunctionReturn(PETSC_SUCCESS);
3526: }
3528: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3529: {
3530: PetscFunctionBegin;
3531: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3532: PetscFunctionReturn(PETSC_SUCCESS);
3533: }
3535: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3536: {
3537: PetscFunctionBegin;
3538: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
3539: PetscFunctionReturn(PETSC_SUCCESS);
3540: }
3542: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3543: {
3544: PetscFunctionBegin;
3545: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3546: PetscFunctionReturn(PETSC_SUCCESS);
3547: }
3549: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3550: {
3551: int i = blockIdx.x * blockDim.x + threadIdx.x;
3552: if (i < n) y[idx[i]] += x[i];
3553: }
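/* Launch sketch for ScatterAdd (mirrors the call in MatMultAddKernel_SeqAIJCUSPARSE below; n, idx, work and z are
   placeholders for the number of compressed rows, the row-index array, the work vector and the full result vector):
     ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, idx, work, z);
   i.e. one thread per compressed row, 256 threads per block, on the default PETSc CUDA stream. */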
3555: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3556: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3557: {
3558: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3559: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3560: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3561: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3562: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3563: PetscBool compressed;
3564: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3565: PetscInt nx, ny;
3566: #endif
3568: PetscFunctionBegin;
3569: PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3570: if (!a->nz) {
3571: if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3572: else PetscCall(VecSeq_CUDA::Set(zz, 0));
3573: PetscFunctionReturn(PETSC_SUCCESS);
3574: }
3575: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3576: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3577: if (!trans) {
3578: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3579: PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3580: } else {
3581: if (herm || !A->form_explicit_transpose) {
3582: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3583: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3584: } else {
3585: if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3586: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3587: }
3588: }
3589: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3590: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3592: try {
3593: PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
3594: if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
3595: else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */
3597: PetscCall(PetscLogGpuTimeBegin());
3598: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3599: /* z = A x + beta y.
3600: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3601: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3602: */
3603: xptr = xarray;
3604: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3605: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3606: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3607: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3608: allocated to accommodate different uses. So we get the length info directly from mat.
3609: */
3610: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3611: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3612: nx = mat->num_cols; // since y = Ax
3613: ny = mat->num_rows;
3614: }
3615: #endif
3616: } else {
3617: /* z = A^T x + beta y
3618: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3619: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3620: */
3621: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3622: dptr = zarray;
3623: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3624: if (compressed) { /* Scatter x to work vector */
3625: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3627: thrust::for_each(
3628: #if PetscDefined(HAVE_THRUST_ASYNC)
3629: thrust::cuda::par.on(PetscDefaultCudaStream),
3630: #endif
3631: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3632: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3633: }
3634: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3635: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3636: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3637: nx = mat->num_rows; // since y = A^T x
3638: ny = mat->num_cols;
3639: }
3640: #endif
3641: }
3643: /* csr_spmv does y = alpha op(A) x + beta y */
3644: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3645: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3646: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3647: cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3648: #else
3649: cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3650: #endif
3652: PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3653: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3654: if (!matDescr) {
3655: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3656: PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3657: }
3658: #endif
3660: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3661: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
3662: PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
3663: PetscCallCUSPARSE(
3664: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3665: PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3666: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3667: PetscCallCUSPARSE(
3668: cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3669: #endif
3670: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3671: } else {
3672: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3673: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
3674: PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3675: }
3677: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3678: #else
3679: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3680: PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3681: #endif
3682: } else {
3683: if (cusparsestruct->nrows) {
3684: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3685: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3686: #else
3687: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3688: PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3689: #endif
3690: }
3691: }
3692: PetscCall(PetscLogGpuTimeEnd());
3694: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3695: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3696: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3697: PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
3698: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3699: PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3700: }
3701: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3702: PetscCall(VecSeq_CUDA::Set(zz, 0));
3703: }
3705: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3706: if (compressed) {
3707: PetscCall(PetscLogGpuTimeBegin());
3708: PetscInt n = (PetscInt)matstruct->cprowIndices->size();
3709: ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3710: PetscCall(PetscLogGpuTimeEnd());
3711: }
3712: } else {
3713: if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3714: }
3715: PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
3716: if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
3717: else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3718: } catch (char *ex) {
3719: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3720: }
3721: if (yy) {
3722: PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3723: } else {
3724: PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3725: }
3726: PetscFunctionReturn(PETSC_SUCCESS);
3727: }
3729: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3730: {
3731: PetscFunctionBegin;
3732: PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3733: PetscFunctionReturn(PETSC_SUCCESS);
3734: }
3736: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3737: {
3738: PetscFunctionBegin;
3739: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3740: PetscFunctionReturn(PETSC_SUCCESS);
3741: }
3743: /*@
3744: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs
3746: Collective
3748: Input Parameters:
3749: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3750: . m - number of rows
3751: . n - number of columns
3752: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
3753: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
3755: Output Parameter:
3756: . A - the matrix
3758: Level: intermediate
3760: Notes:
3761: This matrix will ultimately be pushed down to NVIDIA GPUs and use the cuSPARSE library for
3762: calculations. For good matrix assembly performance the user should preallocate the matrix
3763: storage by setting the parameter `nz` (or the array `nnz`).
3765: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3766: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3767: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3769: The AIJ format, also called
3770: compressed row storage, is fully compatible with standard Fortran
3771: storage. That is, the stored row and column indices can begin at
3772: either one (as in Fortran) or zero.
3774: Specify the preallocated storage with either nz or nnz (not both).
3775: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
3776: allocation.
3778: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`.
3780: .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
3781: `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3782: @*/
3783: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3784: {
3785: PetscFunctionBegin;
3786: PetscCall(MatCreate(comm, A));
3787: PetscCall(MatSetSizes(*A, m, n, m, n));
3788: PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
3789: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
3790: PetscFunctionReturn(PETSC_SUCCESS);
3791: }
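/* Minimal creation sketch (an added illustration, not from the original source), following the MatCreate()/MatSetType()/
   MatSetFromOptions() paradigm recommended in the notes above. PetscCall() error checking is omitted only for brevity,
   and m, n, nnz are assumed to be defined by the caller:

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, m, n, m, n);
     MatSetType(A, MATSEQAIJCUSPARSE);
     MatSetFromOptions(A);
     MatSeqAIJSetPreallocation(A, PETSC_DEFAULT, nnz);
     // ... assemble with MatSetValues() or MatSetValuesCOO(), MatAssemblyBegin/End(), then MatMult() etc. run on the GPU
*/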
3793: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3794: {
3795: PetscFunctionBegin;
3796: if (A->factortype == MAT_FACTOR_NONE) {
3797: PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
3798: } else {
3799: PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3800: }
3801: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3802: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
3803: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
3804: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3805: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3806: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3807: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
3808: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3809: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3810: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
3811: PetscCall(MatDestroy_SeqAIJ(A));
3812: PetscFunctionReturn(PETSC_SUCCESS);
3813: }
3815: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3816: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3817: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3818: {
3819: PetscFunctionBegin;
3820: PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
3821: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
3822: PetscFunctionReturn(PETSC_SUCCESS);
3823: }
3825: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3826: {
3827: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3828: Mat_SeqAIJCUSPARSE *cy;
3829: Mat_SeqAIJCUSPARSE *cx;
3830: PetscScalar *ay;
3831: const PetscScalar *ax;
3832: CsrMatrix *csry, *csrx;
3834: PetscFunctionBegin;
3835: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3836: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3837: if (X->ops->axpy != Y->ops->axpy) {
3838: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3839: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3840: PetscFunctionReturn(PETSC_SUCCESS);
3841: }
3842: /* if we are here, it means both matrices are bound to GPU */
3843: PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
3844: PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
3845: PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3846: PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3847: csry = (CsrMatrix *)cy->mat->mat;
3848: csrx = (CsrMatrix *)cx->mat->mat;
3849: /* see if we can turn this into a cublas axpy */
3850: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3851: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3852: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3853: if (eq) str = SAME_NONZERO_PATTERN;
3854: }
3855: /* spgeam is buggy with one column */
3856: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3858: if (str == SUBSET_NONZERO_PATTERN) {
3859: PetscScalar b = 1.0;
3860: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3861: size_t bufferSize;
3862: void *buffer;
3863: #endif
3865: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3866: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3867: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3868: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3869: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3870: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3871: PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
3872: PetscCall(PetscLogGpuTimeBegin());
3873: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3874: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3875: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3876: PetscCall(PetscLogGpuTimeEnd());
3877: PetscCallCUDA(cudaFree(buffer));
3878: #else
3879: PetscCall(PetscLogGpuTimeBegin());
3880: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3881: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3882: PetscCall(PetscLogGpuFlops(x->nz + y->nz));
3883: PetscCall(PetscLogGpuTimeEnd());
3884: #endif
3885: PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
3886: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3887: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3888: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3889: } else if (str == SAME_NONZERO_PATTERN) {
3890: cublasHandle_t cublasv2handle;
3891: PetscBLASInt one = 1, bnz = 1;
3893: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
3894: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3895: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3896: PetscCall(PetscBLASIntCast(x->nz, &bnz));
3897: PetscCall(PetscLogGpuTimeBegin());
3898: PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
3899: PetscCall(PetscLogGpuFlops(2.0 * bnz));
3900: PetscCall(PetscLogGpuTimeEnd());
3901: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
3902: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3903: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3904: } else {
3905: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
3906: PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3907: }
3908: PetscFunctionReturn(PETSC_SUCCESS);
3909: }
3911: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3912: {
3913: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3914: PetscScalar *ay;
3915: cublasHandle_t cublasv2handle;
3916: PetscBLASInt one = 1, bnz = 1;
3918: PetscFunctionBegin;
3919: PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
3920: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
3921: PetscCall(PetscBLASIntCast(y->nz, &bnz));
3922: PetscCall(PetscLogGpuTimeBegin());
3923: PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
3924: PetscCall(PetscLogGpuFlops(bnz));
3925: PetscCall(PetscLogGpuTimeEnd());
3926: PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
3927: PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3928: PetscFunctionReturn(PETSC_SUCCESS);
3929: }
3931: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3932: {
3933: PetscBool both = PETSC_FALSE;
3934: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3936: PetscFunctionBegin;
3937: if (A->factortype == MAT_FACTOR_NONE) {
3938: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3939: if (spptr->mat) {
3940: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3941: if (matrix->values) {
3942: both = PETSC_TRUE;
3943: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3944: }
3945: }
3946: if (spptr->matTranspose) {
3947: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3948: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3949: }
3950: }
3951: PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
3952: PetscCall(MatSeqAIJInvalidateDiagonal(A));
3953: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3954: else A->offloadmask = PETSC_OFFLOAD_CPU;
3955: PetscFunctionReturn(PETSC_SUCCESS);
3956: }
3958: static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m)
3959: {
3960: PetscFunctionBegin;
3961: *m = PETSC_MEMTYPE_CUDA;
3962: PetscFunctionReturn(PETSC_SUCCESS);
3963: }
3965: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3966: {
3967: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3969: PetscFunctionBegin;
3970: if (A->factortype != MAT_FACTOR_NONE) {
3971: A->boundtocpu = flg;
3972: PetscFunctionReturn(PETSC_SUCCESS);
3973: }
3974: if (flg) {
3975: PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3977: A->ops->scale = MatScale_SeqAIJ;
3978: A->ops->axpy = MatAXPY_SeqAIJ;
3979: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3980: A->ops->mult = MatMult_SeqAIJ;
3981: A->ops->multadd = MatMultAdd_SeqAIJ;
3982: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3983: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3984: A->ops->multhermitiantranspose = NULL;
3985: A->ops->multhermitiantransposeadd = NULL;
3986: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3987: A->ops->getcurrentmemtype = NULL;
3988: PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
3989: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
3990: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
3991: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
3992: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
3993: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
3994: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3995: } else {
3996: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3997: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3998: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3999: A->ops->mult = MatMult_SeqAIJCUSPARSE;
4000: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
4001: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
4002: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
4003: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4004: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4005: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
4006: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4007: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4008: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4009: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4010: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4011: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4012: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4013: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4015: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4016: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4017: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4018: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
4019: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
4020: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4021: }
4022: A->boundtocpu = flg;
4023: if (flg && a->inode.size_csr) {
4024: a->inode.use = PETSC_TRUE;
4025: } else {
4026: a->inode.use = PETSC_FALSE;
4027: }
4028: PetscFunctionReturn(PETSC_SUCCESS);
4029: }
4031: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4032: {
4033: Mat B;
4035: PetscFunctionBegin;
4036: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4037: if (reuse == MAT_INITIAL_MATRIX) {
4038: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
4039: } else if (reuse == MAT_REUSE_MATRIX) {
4040: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
4041: }
4042: B = *newmat;
4044: PetscCall(PetscFree(B->defaultvectype));
4045: PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
4047: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4048: if (B->factortype == MAT_FACTOR_NONE) {
4049: Mat_SeqAIJCUSPARSE *spptr;
4050: PetscCall(PetscNew(&spptr));
4051: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4052: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4053: spptr->format = MAT_CUSPARSE_CSR;
4054: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4055: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4056: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4057: #else
4058: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4059: #endif
4060: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4061: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4062: #endif
4063: B->spptr = spptr;
4064: } else {
4065: Mat_SeqAIJCUSPARSETriFactors *spptr;
4067: PetscCall(PetscNew(&spptr));
4068: PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4069: PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4070: B->spptr = spptr;
4071: }
4072: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4073: }
4074: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
4075: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
4076: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
4077: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4078: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
4079: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
4080: B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE;
4082: PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
4083: PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
4084: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4085: #if defined(PETSC_HAVE_HYPRE)
4086: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4087: #endif
4088: PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4089: PetscFunctionReturn(PETSC_SUCCESS);
4090: }
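/* Conversion sketch (added illustration): an existing MATSEQAIJ matrix A can be moved to this type with
     Mat B;
     MatConvert(A, MATSEQAIJCUSPARSE, MAT_INITIAL_MATRIX, &B);
   or converted in place with MAT_INPLACE_MATRIX and &A, which is exactly how MatCreate_SeqAIJCUSPARSE() below uses it. */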
4092: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4093: {
4094: PetscFunctionBegin;
4095: PetscCall(MatCreate_SeqAIJ(B));
4096: PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
4097: PetscFunctionReturn(PETSC_SUCCESS);
4098: }
4100: /*MC
4101: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.
4103: Options Database Keys:
4104: + -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
4105: . -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
4106: Other options include ell (ellpack) or hyb (hybrid).
4107: . -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
4108: - -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU
4110: Level: beginner
4112: Notes:
4113: These matrices can be stored in CSR, ELL, or HYB format; the ELL and HYB formats are only available with CUDA versions before 11.0.
4115: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4117: Uses 32-bit integers internally. If PETSc is configured with `--with-64-bit-indices`, the row and column indices are still stored on the GPU as 32-bit `int`; it is unclear what happens
4118: if some of the index values passed in do not fit in `int`.
4120: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4121: M*/
4123: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4124: {
4125: PetscFunctionBegin;
4126: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
4127: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
4128: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
4129: PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4130: PetscFunctionReturn(PETSC_SUCCESS);
4131: }
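/* Usage sketch (added note): with the registrations above, the cuSPARSE factorizations can be requested from the
   command line of a hypothetical application via the standard PETSc options, e.g.
     ./app -pc_type ilu -pc_factor_mat_solver_type cusparse
   assuming the matrix type is (seq)aijcusparse; MATSOLVERCUSPARSE covers LU, Cholesky, ILU and ICC per the calls above. */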
4133: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4134: {
4135: Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4137: PetscFunctionBegin;
4138: if (cusp) {
4139: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
4140: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4141: delete cusp->workVector;
4142: delete cusp->rowoffsets_gpu;
4143: delete cusp->csr2csc_i;
4144: delete cusp->coords;
4145: if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
4146: PetscCall(PetscFree(mat->spptr));
4147: }
4148: PetscFunctionReturn(PETSC_SUCCESS);
4149: }
4151: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4152: {
4153: PetscFunctionBegin;
4154: if (*mat) {
4155: delete (*mat)->values;
4156: delete (*mat)->column_indices;
4157: delete (*mat)->row_offsets;
4158: delete *mat;
4159: *mat = 0;
4160: }
4161: PetscFunctionReturn(PETSC_SUCCESS);
4162: }
4164: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4165: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4166: {
4167: PetscFunctionBegin;
4168: if (*trifactor) {
4169: if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4170: if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4171: PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4172: if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4173: if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4174: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4175: if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4176: #endif
4177: PetscCall(PetscFree(*trifactor));
4178: }
4179: PetscFunctionReturn(PETSC_SUCCESS);
4180: }
4181: #endif
4183: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4184: {
4185: CsrMatrix *mat;
4187: PetscFunctionBegin;
4188: if (*matstruct) {
4189: if ((*matstruct)->mat) {
4190: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4191: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4192: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4193: #else
4194: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4195: PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4196: #endif
4197: } else {
4198: mat = (CsrMatrix *)(*matstruct)->mat;
4199: PetscCall(CsrMatrix_Destroy(&mat));
4200: }
4201: }
4202: if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4203: delete (*matstruct)->cprowIndices;
4204: if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4205: if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4206: if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4208: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4209: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4210: if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4212: for (int i = 0; i < 3; i++) {
4213: if (mdata->cuSpMV[i].initialized) {
4214: PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4215: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4216: PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4217: #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4218: if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4219: if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4220: #endif
4221: }
4222: }
4223: #endif
4224: delete *matstruct;
4225: *matstruct = NULL;
4226: }
4227: PetscFunctionReturn(PETSC_SUCCESS);
4228: }
4230: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4231: {
4232: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4234: PetscFunctionBegin;
4235: if (fs) {
4236: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4237: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4238: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4239: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4240: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4241: delete fs->workVector;
4242: fs->workVector = NULL;
4243: #endif
4244: delete fs->rpermIndices;
4245: delete fs->cpermIndices;
4246: fs->rpermIndices = NULL;
4247: fs->cpermIndices = NULL;
4248: fs->init_dev_prop = PETSC_FALSE;
4249: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4250: PetscCallCUDA(cudaFree(fs->csrRowPtr));
4251: PetscCallCUDA(cudaFree(fs->csrColIdx));
4252: PetscCallCUDA(cudaFree(fs->csrRowPtr32));
4253: PetscCallCUDA(cudaFree(fs->csrColIdx32));
4254: PetscCallCUDA(cudaFree(fs->csrVal));
4255: PetscCallCUDA(cudaFree(fs->diag));
4256: PetscCallCUDA(cudaFree(fs->X));
4257: PetscCallCUDA(cudaFree(fs->Y));
4258: // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4259: PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4260: PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4261: PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4262: PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4263: PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4264: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4265: PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4266: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4267: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4268: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4269: PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4270: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4271: PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4272: PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4273: PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4274: PetscCall(PetscFree(fs->csrRowPtr_h));
4275: PetscCall(PetscFree(fs->csrVal_h));
4276: PetscCall(PetscFree(fs->diag_h));
4277: fs->createdTransposeSpSVDescr = PETSC_FALSE;
4278: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4279: #endif
4280: }
4281: PetscFunctionReturn(PETSC_SUCCESS);
4282: }
4284: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4285: {
4286: PetscFunctionBegin;
4287: if (*trifactors) {
4288: PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4289: PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
4290: PetscCall(PetscFree(*trifactors));
4291: }
4292: PetscFunctionReturn(PETSC_SUCCESS);
4293: }
4295: struct IJCompare {
4296: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4297: {
4298: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4299: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4300: return false;
4301: }
4302: };
4304: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4305: {
4306: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4308: PetscFunctionBegin;
4309: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4310: if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4311: if (destroy) {
4312: PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4313: delete cusp->csr2csc_i;
4314: cusp->csr2csc_i = NULL;
4315: }
4316: A->transupdated = PETSC_FALSE;
4317: PetscFunctionReturn(PETSC_SUCCESS);
4318: }
4320: static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
4321: {
4322: MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;
4324: PetscFunctionBegin;
4325: PetscCallCUDA(cudaFree(coo->perm));
4326: PetscCallCUDA(cudaFree(coo->jmap));
4327: PetscCall(PetscFree(coo));
4328: PetscFunctionReturn(PETSC_SUCCESS);
4329: }
4331: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4332: {
4333: PetscBool dev_ij = PETSC_FALSE;
4334: PetscMemType mtype = PETSC_MEMTYPE_HOST;
4335: PetscInt *i, *j;
4336: PetscContainer container_h;
4337: MatCOOStruct_SeqAIJ *coo_h, *coo_d;
4339: PetscFunctionBegin;
4340: PetscCall(PetscGetMemType(coo_i, &mtype));
4341: if (PetscMemTypeDevice(mtype)) {
4342: dev_ij = PETSC_TRUE;
4343: PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
4344: PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4345: PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4346: } else {
4347: i = coo_i;
4348: j = coo_j;
4349: }
4351: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
4352: if (dev_ij) PetscCall(PetscFree2(i, j));
4353: mat->offloadmask = PETSC_OFFLOAD_CPU;
4354: // Create the GPU memory
4355: PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4357: // Copy the COO struct to device
4358: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
4359: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
4360: PetscCall(PetscMalloc1(1, &coo_d));
4361: *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
4362: PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
4363: PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
4364: PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
4365: PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4367: // Put the COO struct in a container and then attach that to the matrix
4368: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
4369: PetscFunctionReturn(PETSC_SUCCESS);
4370: }
4372: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4373: {
4374: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4375: const PetscCount grid_size = gridDim.x * blockDim.x;
4376: for (; i < nnz; i += grid_size) {
4377: PetscScalar sum = 0.0;
4378: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4379: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4380: }
4381: }
4383: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4384: {
4385: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4386: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4387: PetscCount Annz = seq->nz;
4388: PetscMemType memtype;
4389: const PetscScalar *v1 = v;
4390: PetscScalar *Aa;
4391: PetscContainer container;
4392: MatCOOStruct_SeqAIJ *coo;
4394: PetscFunctionBegin;
4395: if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4397: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
4398: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
4400: PetscCall(PetscGetMemType(v, &memtype));
4401: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on the host, copy it to the device */
4402: PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
4403: PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4404: }
4406: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
4407: else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4409: PetscCall(PetscLogGpuTimeBegin());
4410: if (Annz) {
4411: MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
4412: PetscCallCUDA(cudaPeekAtLastError());
4413: }
4414: PetscCall(PetscLogGpuTimeEnd());
4416: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
4417: else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4419: if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4420: PetscFunctionReturn(PETSC_SUCCESS);
4421: }
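/*
   A minimal usage sketch of the COO assembly path implemented above, through the public
   MatSetPreallocationCOO()/MatSetValuesCOO() interface (sizes and values are illustrative):

     Mat         A;
     PetscInt    coo_i[4] = {0, 0, 1, 1}, coo_j[4] = {0, 1, 0, 1};
     PetscScalar coo_v[4] = {1.0, 2.0, 3.0, 4.0};

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 2, 2, 2, 2));
     PetscCall(MatSetType(A, MATSEQAIJCUSPARSE));
     PetscCall(MatSetPreallocationCOO(A, 4, coo_i, coo_j)); // dispatches to MatSetPreallocationCOO_SeqAIJCUSPARSE()
     PetscCall(MatSetValuesCOO(A, coo_v, INSERT_VALUES));   // dispatches to MatSetValuesCOO_SeqAIJCUSPARSE()
     PetscCall(MatDestroy(&A));
*/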
4423: /*@C
4424: MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
4426: Not Collective
4428: Input Parameters:
4429: + A - the matrix
4430: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4432: Output Parameters:
4433: + i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4434: - j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
4436: Level: developer
4438: Note:
4439: When compressed is true, the CSR structure does not contain empty rows
4441: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4442: @*/
4443: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4444: {
4445: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4446: CsrMatrix *csr;
4447: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4449: PetscFunctionBegin;
4451: if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
4452: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4453: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4454: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4455: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4456: csr = (CsrMatrix *)cusp->mat->mat;
4457: if (i) {
4458: if (!compressed && a->compressedrow.use) { /* need full row offset */
4459: if (!cusp->rowoffsets_gpu) {
4460: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4461: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4462: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4463: }
4464: *i = cusp->rowoffsets_gpu->data().get();
4465: } else *i = csr->row_offsets->data().get();
4466: }
4467: if (j) *j = csr->column_indices->data().get();
4468: PetscFunctionReturn(PETSC_SUCCESS);
4469: }
4471: /*@C
4472: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4474: Not Collective
4476: Input Parameters:
4477: + A - the matrix
4478: . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4479: . i - the CSR row pointers
4480: - j - the CSR column indices
4482: Level: developer
4484: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
4485: @*/
4486: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4487: {
4488: PetscFunctionBegin;
4490: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4491: if (i) *i = NULL;
4492: if (j) *j = NULL;
4493: (void)compressed;
4494: PetscFunctionReturn(PETSC_SUCCESS);
4495: }
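/*
   A minimal usage sketch for MatSeqAIJCUSPARSEGetIJ()/MatSeqAIJCUSPARSERestoreIJ() (A is
   assumed to be an assembled MATSEQAIJCUSPARSE matrix):

     const int *i, *j;

     PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &i, &j));
     // i and j now point to device memory holding the full CSR row pointers and column indices
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &i, &j));
*/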
4497: /*@C
4498: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the device array where the nonzero values of a `MATSEQAIJCUSPARSE` matrix are stored
4500: Not Collective
4502: Input Parameter:
4503: . A - a `MATSEQAIJCUSPARSE` matrix
4505: Output Parameter:
4506: . a - pointer to the device data
4508: Level: developer
4510: Note:
4511: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4513: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4514: @*/
4515: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4516: {
4517: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4518: CsrMatrix *csr;
4520: PetscFunctionBegin;
4522: PetscAssertPointer(a, 2);
4523: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4524: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4525: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4526: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4527: csr = (CsrMatrix *)cusp->mat->mat;
4528: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4529: *a = csr->values->data().get();
4530: PetscFunctionReturn(PETSC_SUCCESS);
4531: }
4533: /*@C
4534: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4536: Not Collective
4538: Input Parameters:
4539: + A - a `MATSEQAIJCUSPARSE` matrix
4540: - a - pointer to the device data
4542: Level: developer
4544: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
4545: @*/
4546: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4547: {
4548: PetscFunctionBegin;
4550: PetscAssertPointer(a, 2);
4551: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4552: *a = NULL;
4553: PetscFunctionReturn(PETSC_SUCCESS);
4554: }
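/*
   A minimal usage sketch for the read-only accessor pair above (A is assumed to be an
   assembled MATSEQAIJCUSPARSE matrix; my_device_kernel is a hypothetical consumer):

     const PetscScalar *a;

     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &a));
     // a points to the device array of nonzero values, e.g. my_device_kernel<<<grid, block>>>(a, ...);
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &a));
*/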
4556: /*@C
4557: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4559: Not Collective
4561: Input Parameter:
4562: . A - a `MATSEQAIJCUSPARSE` matrix
4564: Output Parameter:
4565: . a - pointer to the device data
4567: Level: developer
4569: Note:
4570: Will trigger host-to-device copies if the most up-to-date matrix data is on the host
4572: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4573: @*/
4574: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4575: {
4576: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4577: CsrMatrix *csr;
4579: PetscFunctionBegin;
4581: PetscAssertPointer(a, 2);
4582: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4583: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4584: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4585: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4586: csr = (CsrMatrix *)cusp->mat->mat;
4587: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4588: *a = csr->values->data().get();
4589: A->offloadmask = PETSC_OFFLOAD_GPU;
4590: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4591: PetscFunctionReturn(PETSC_SUCCESS);
4592: }
4593: /*@C
4594: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4596: Not Collective
4598: Input Parameters:
4599: + A - a `MATSEQAIJCUSPARSE` matrix
4600: - a - pointer to the device data
4602: Level: developer
4604: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
4605: @*/
4606: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4607: {
4608: PetscFunctionBegin;
4610: PetscAssertPointer(a, 2);
4611: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4612: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4613: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4614: *a = NULL;
4615: PetscFunctionReturn(PETSC_SUCCESS);
4616: }
4618: /*@C
4619: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4621: Not Collective
4623: Input Parameter:
4624: . A - a `MATSEQAIJCUSPARSE` matrix
4626: Output Parameter:
4627: . a - pointer to the device data
4629: Level: developer
4631: Note:
4632: Does not trigger any host-to-device copies.
4634: It marks the GPU data as valid, so users must set all the values in `a` to ensure out-of-date data is not considered current
4636: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4637: @*/
4638: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4639: {
4640: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4641: CsrMatrix *csr;
4643: PetscFunctionBegin;
4645: PetscAssertPointer(a, 2);
4646: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4647: PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4648: PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4649: csr = (CsrMatrix *)cusp->mat->mat;
4650: PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4651: *a = csr->values->data().get();
4652: A->offloadmask = PETSC_OFFLOAD_GPU;
4653: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4654: PetscFunctionReturn(PETSC_SUCCESS);
4655: }
4657: /*@C
4658: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4660: Not Collective
4662: Input Parameters:
4663: + A - a `MATSEQAIJCUSPARSE` matrix
4664: - a - pointer to the device data
4666: Level: developer
4668: .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
4669: @*/
4670: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4671: {
4672: PetscFunctionBegin;
4674: PetscAssertPointer(a, 2);
4675: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4676: PetscCall(MatSeqAIJInvalidateDiagonal(A));
4677: PetscCall(PetscObjectStateIncrease((PetscObject)A));
4678: *a = NULL;
4679: PetscFunctionReturn(PETSC_SUCCESS);
4680: }
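/*
   A minimal usage sketch contrasting the read-write and write-only accessors above (A is
   assumed to be an assembled MATSEQAIJCUSPARSE matrix):

     PetscScalar *a;

     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a));       // existing values are copied to the device if needed
     // ... modify some entries of a[] on the device ...
     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a));

     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &a));  // no host-to-device copy; previous contents must not be read
     // ... overwrite every entry of a[] on the device ...
     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &a));
*/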
4682: struct IJCompare4 {
4683: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4684: {
4685: if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
4686: if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
4687: return false;
4688: }
4689: };
4691: struct Shift {
4692: int _shift;
4694: Shift(int shift) : _shift(shift) { }
4695: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4696: };
4698: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows side by side, i.e. C = [A B] (the [A';B']' operation in MATLAB notation) */
4699: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4700: {
4701: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4702: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4703: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4704: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4705: PetscInt Annz, Bnnz;
4706: cusparseStatus_t stat;
4707: PetscInt i, m, n, zero = 0;
4709: PetscFunctionBegin;
4712: PetscAssertPointer(C, 4);
4713: PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4714: PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
4715: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
4716: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4717: PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4718: PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4719: if (reuse == MAT_INITIAL_MATRIX) {
4720: m = A->rmap->n;
4721: n = A->cmap->n + B->cmap->n;
4722: PetscCall(MatCreate(PETSC_COMM_SELF, C));
4723: PetscCall(MatSetSizes(*C, m, n, m, n));
4724: PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4725: c = (Mat_SeqAIJ *)(*C)->data;
4726: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4727: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4728: Ccsr = new CsrMatrix;
4729: Cmat->cprowIndices = NULL;
4730: c->compressedrow.use = PETSC_FALSE;
4731: c->compressedrow.nrows = 0;
4732: c->compressedrow.i = NULL;
4733: c->compressedrow.rindex = NULL;
4734: Ccusp->workVector = NULL;
4735: Ccusp->nrows = m;
4736: Ccusp->mat = Cmat;
4737: Ccusp->mat->mat = Ccsr;
4738: Ccsr->num_rows = m;
4739: Ccsr->num_cols = n;
4740: PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
4741: PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
4742: PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4743: PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4744: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4745: PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
4746: PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4747: PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4748: PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4749: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4750: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4751: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4752: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4754: Acsr = (CsrMatrix *)Acusp->mat->mat;
4755: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4756: Annz = (PetscInt)Acsr->column_indices->size();
4757: Bnnz = (PetscInt)Bcsr->column_indices->size();
4758: c->nz = Annz + Bnnz;
4759: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4760: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4761: Ccsr->values = new THRUSTARRAY(c->nz);
4762: Ccsr->num_entries = c->nz;
4763: Ccusp->coords = new THRUSTINTARRAY(c->nz);
4764: if (c->nz) {
4765: auto Acoo = new THRUSTINTARRAY32(Annz);
4766: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4767: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4768: THRUSTINTARRAY32 *Aroff, *Broff;
4770: if (a->compressedrow.use) { /* need full row offset */
4771: if (!Acusp->rowoffsets_gpu) {
4772: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4773: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4774: PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4775: }
4776: Aroff = Acusp->rowoffsets_gpu;
4777: } else Aroff = Acsr->row_offsets;
4778: if (b->compressedrow.use) { /* need full row offset */
4779: if (!Bcusp->rowoffsets_gpu) {
4780: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4781: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4782: PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4783: }
4784: Broff = Bcusp->rowoffsets_gpu;
4785: } else Broff = Bcsr->row_offsets;
4786: PetscCall(PetscLogGpuTimeBegin());
4787: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4788: PetscCallCUSPARSE(stat);
4789: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4790: PetscCallCUSPARSE(stat);
4791: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4792: auto Aperm = thrust::make_constant_iterator(1);
4793: auto Bperm = thrust::make_constant_iterator(0);
4794: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4795: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4796: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4797: #else
4798: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4799: auto Bcib = Bcsr->column_indices->begin();
4800: auto Bcie = Bcsr->column_indices->end();
4801: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4802: #endif
4803: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4804: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4805: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4806: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4807: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4808: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4809: auto p1 = Ccusp->coords->begin();
4810: auto p2 = Ccusp->coords->begin();
4811: #if CCCL_VERSION >= 3001000
4812: cuda::std::advance(p2, Annz);
4813: #else
4814: thrust::advance(p2, Annz);
4815: #endif
4816: PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
4817: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4818: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4819: #endif
4820: auto cci = thrust::make_counting_iterator(zero);
4821: auto cce = thrust::make_counting_iterator(c->nz);
4822: #if 0 //Errors on SUMMIT cuda 11.1.0
4823: PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4824: #else
4825: #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST)
4826: auto pred = thrust::identity<int>();
4827: #else
4828: auto pred = cuda::std::identity();
4829: #endif
4830: PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4831: PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
4832: #endif
4833: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4834: PetscCallCUSPARSE(stat);
4835: PetscCall(PetscLogGpuTimeEnd());
4836: delete wPerm;
4837: delete Acoo;
4838: delete Bcoo;
4839: delete Ccoo;
4840: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4841: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4842: PetscCallCUSPARSE(stat);
4843: #endif
4844: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4845: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
4846: PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4847: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4848: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4849: CsrMatrix *CcsrT = new CsrMatrix;
4850: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4851: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4853: (*C)->form_explicit_transpose = PETSC_TRUE;
4854: (*C)->transupdated = PETSC_TRUE;
4855: Ccusp->rowoffsets_gpu = NULL;
4856: CmatT->cprowIndices = NULL;
4857: CmatT->mat = CcsrT;
4858: CcsrT->num_rows = n;
4859: CcsrT->num_cols = m;
4860: CcsrT->num_entries = c->nz;
4862: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4863: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4864: CcsrT->values = new THRUSTARRAY(c->nz);
4866: PetscCall(PetscLogGpuTimeBegin());
4867: auto rT = CcsrT->row_offsets->begin();
4868: if (AT) {
4869: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4870: #if CCCL_VERSION >= 3001000
4871: cuda::std::advance(rT, -1);
4872: #else
4873: thrust::advance(rT, -1);
4874: #endif
4875: }
4876: if (BT) {
4877: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4878: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4879: thrust::copy(titb, tite, rT);
4880: }
4881: auto cT = CcsrT->column_indices->begin();
4882: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4883: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4884: auto vT = CcsrT->values->begin();
4885: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4886: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4887: PetscCall(PetscLogGpuTimeEnd());
4889: PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
4890: PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
4891: PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4892: PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4893: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4894: PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
4895: PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4896: PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4897: PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4898: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4899: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4900: PetscCallCUSPARSE(stat);
4901: #endif
4902: Ccusp->matTranspose = CmatT;
4903: }
4904: }
4906: c->free_a = PETSC_TRUE;
4907: PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
4908: PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4909: c->free_ij = PETSC_TRUE;
4910: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4911: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4912: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4913: ii = *Ccsr->row_offsets;
4914: jj = *Ccsr->column_indices;
4915: PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4916: PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4917: } else {
4918: PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4919: PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4920: }
4921: PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
4922: PetscCall(PetscMalloc1(m, &c->ilen));
4923: PetscCall(PetscMalloc1(m, &c->imax));
4924: c->maxnz = c->nz;
4925: c->nonzerorowcnt = 0;
4926: c->rmax = 0;
4927: for (i = 0; i < m; i++) {
4928: const PetscInt nn = c->i[i + 1] - c->i[i];
4929: c->ilen[i] = c->imax[i] = nn;
4930: c->nonzerorowcnt += (PetscInt)!!nn;
4931: c->rmax = PetscMax(c->rmax, nn);
4932: }
4933: PetscCall(MatMarkDiagonal_SeqAIJ(*C));
4934: PetscCall(PetscMalloc1(c->nz, &c->a));
4935: (*C)->nonzerostate++;
4936: PetscCall(PetscLayoutSetUp((*C)->rmap));
4937: PetscCall(PetscLayoutSetUp((*C)->cmap));
4938: Ccusp->nonzerostate = (*C)->nonzerostate;
4939: (*C)->preallocated = PETSC_TRUE;
4940: } else {
4941: PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4942: c = (Mat_SeqAIJ *)(*C)->data;
4943: if (c->nz) {
4944: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4945: PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4946: PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4947: PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
4948: PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4949: PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
4950: PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4951: PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4952: Acsr = (CsrMatrix *)Acusp->mat->mat;
4953: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4954: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4955: PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4956: PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4957: PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4958: PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
4959: PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
4960: auto pmid = Ccusp->coords->begin();
4961: #if CCCL_VERSION >= 3001000
4962: cuda::std::advance(pmid, Acsr->num_entries);
4963: #else
4964: thrust::advance(pmid, Acsr->num_entries);
4965: #endif
4966: PetscCall(PetscLogGpuTimeBegin());
4967: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
4968: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4969: thrust::for_each(zibait, zieait, VecCUDAEquals());
4970: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4971: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4972: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4973: PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
4974: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4975: PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4976: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4977: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4978: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4979: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4980: auto vT = CcsrT->values->begin();
4981: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4982: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4983: (*C)->transupdated = PETSC_TRUE;
4984: }
4985: PetscCall(PetscLogGpuTimeEnd());
4986: }
4987: }
4988: PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4989: (*C)->assembled = PETSC_TRUE;
4990: (*C)->was_assembled = PETSC_FALSE;
4991: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4992: PetscFunctionReturn(PETSC_SUCCESS);
4993: }
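/*
   A minimal usage sketch for MatSeqAIJCUSPARSEMergeMats() (A and B are assumed to be
   MATSEQAIJCUSPARSE matrices with the same number of rows), producing C = [A B]:

     Mat C;

     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_INITIAL_MATRIX, &C));
     // ... later, after only the numerical values of A and B have changed ...
     PetscCall(MatSeqAIJCUSPARSEMergeMats(A, B, MAT_REUSE_MATRIX, &C));
*/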
4995: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4996: {
4997: bool dmem;
4998: const PetscScalar *av;
5000: PetscFunctionBegin;
5001: dmem = isCudaMem(v);
5002: PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5003: if (n && idx) {
5004: THRUSTINTARRAY widx(n);
5005: widx.assign(idx, idx + n);
5006: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5008: THRUSTARRAY *w = NULL;
5009: thrust::device_ptr<PetscScalar> dv;
5010: if (dmem) {
5011: dv = thrust::device_pointer_cast(v);
5012: } else {
5013: w = new THRUSTARRAY(n);
5014: dv = w->data();
5015: }
5016: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5018: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5019: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5020: thrust::for_each(zibit, zieit, VecCUDAEquals());
5021: if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5022: delete w;
5023: } else {
5024: PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5025: }
5026: if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
5027: PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
5028: PetscFunctionReturn(PETSC_SUCCESS);
5029: }