Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library,
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <petscconf.h>
9: #include <../src/mat/impls/aij/seq/aij.h>
10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: #include <thrust/adjacent_difference.h>
16: #if PETSC_CPP_VERSION >= 14
17: #define PETSC_HAVE_THRUST_ASYNC 1
18: // thrust::for_each(thrust::cuda::par.on()) requires C++14
19: #include <thrust/async/for_each.h>
20: #endif
21: #include <thrust/iterator/constant_iterator.h>
22: #include <thrust/remove.h>
23: #include <thrust/sort.h>
24: #include <thrust/unique.h>
26: const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
29: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
31: typedef enum {
32: CUSPARSE_MV_ALG_DEFAULT = 0,
33: CUSPARSE_COOMV_ALG = 1,
34: CUSPARSE_CSRMV_ALG1 = 2,
35: CUSPARSE_CSRMV_ALG2 = 3
36: } cusparseSpMVAlg_t;
38: typedef enum {
39: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
41: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
42: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
43: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
44: CUSPARSE_SPMM_ALG_DEFAULT = 0,
45: CUSPARSE_SPMM_COO_ALG1 = 1,
46: CUSPARSE_SPMM_COO_ALG2 = 2,
47: CUSPARSE_SPMM_COO_ALG3 = 3,
48: CUSPARSE_SPMM_COO_ALG4 = 5,
49: CUSPARSE_SPMM_CSR_ALG1 = 4,
50: CUSPARSE_SPMM_CSR_ALG2 = 6,
51: } cusparseSpMMAlg_t;
53: typedef enum {
54: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55: CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic
56: } cusparseCsr2CscAlg_t;
57: */
58: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61: #endif
63: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
67: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
68: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
69: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
71: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
72: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
73: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
74: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
77: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
78: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
79: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
86: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
92: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
95: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96: static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97: static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
99: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
100: {
101: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
103: switch (op) {
104: case MAT_CUSPARSE_MULT:
105: cusparsestruct->format = format;
106: break;
107: case MAT_CUSPARSE_ALL:
108: cusparsestruct->format = format;
109: break;
110: default:
111: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112: }
113: return 0;
114: }
116: /*@
117: MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
118: operation. Only the `MatMult()` operation can use different GPU storage formats.
120: Not Collective
122: Input Parameters:
123: + A - Matrix of type `MATSEQAIJCUSPARSE`
124: . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
125: `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
126: - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, or `MAT_CUSPARSE_HYB`)
130: Level: intermediate
132: .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
133: @*/
134: PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
135: {
137: PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
138: return 0;
139: }
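/*
  Example (illustrative sketch, not from the original file): selecting a cuSPARSE storage format for
  the MatMult() path of a sequential AIJCUSPARSE matrix. The function name, communicator, and sizes
  are placeholders, and MAT_CUSPARSE_ELL is only meaningful for CUDA 10.x builds (CSR is the default
  and the only format supported with CUDA 11+). The same effect is available at run time with
  -mat_cusparse_mult_storage_format ell.
*/
static PetscErrorCode ExampleSelectEllFormat(PetscInt n, Mat *A)
{
  PetscCall(MatCreate(PETSC_COMM_SELF, A));
  PetscCall(MatSetSizes(*A, PETSC_DECIDE, PETSC_DECIDE, n, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetUp(*A));
  /* ELL is used for MatMult() only; other operations keep their own storage */
  PetscCall(MatCUSPARSESetFormat(*A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
  return 0;
}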
141: PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
142: {
143: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
145: cusparsestruct->use_cpu_solve = use_cpu;
146: return 0;
147: }
149: /*@
150: MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()` for a factored `MATSEQAIJCUSPARSE` matrix.
152: Input Parameters:
153: + A - Matrix of type `MATSEQAIJCUSPARSE`
154: - use_cpu - set flag for using the built-in CPU `MatSolve()`
158: Note:
159: The cuSPARSE LU factorization currently computes the factors with the built-in CPU method
160: and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there as well.
161: Use this routine to specify whether the solve is done on the CPU or the GPU (the GPU is the default).
163: Level: intermediate
165: .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
166: @*/
167: PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
168: {
170: PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
171: return 0;
172: }
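/*
  Example (illustrative sketch, not from the original file): keeping the (I)LU triangular solves on
  the CPU. The flag is consulted during numeric factorization (see MatLUFactorNumeric_SeqAIJCUSPARSE
  below), so it should be set before the factors are computed; it is equivalent to the run-time
  option -mat_cusparse_use_cpu_solve. The helper name is a placeholder and A is assumed to be a
  MATSEQAIJCUSPARSE matrix created elsewhere.
*/
static PetscErrorCode ExampleKeepSolveOnCPU(Mat A)
{
  PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));
  return 0;
}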
174: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
175: {
176: switch (op) {
177: case MAT_FORM_EXPLICIT_TRANSPOSE:
178: /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
179: if (A->form_explicit_transpose && !flg) MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
180: A->form_explicit_transpose = flg;
181: break;
182: default:
183: MatSetOption_SeqAIJ(A, op, flg);
184: break;
185: }
186: return 0;
187: }
189: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
191: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
192: {
193: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
194: IS isrow = b->row, iscol = b->col;
195: PetscBool row_identity, col_identity;
196: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;
198: MatSeqAIJCUSPARSECopyFromGPU(A);
199: MatLUFactorNumeric_SeqAIJ(B, A, info);
200: B->offloadmask = PETSC_OFFLOAD_CPU;
201: /* determine which version of MatSolve needs to be used. */
202: ISIdentity(isrow, &row_identity);
203: ISIdentity(iscol, &col_identity);
205: if (!cusparsestruct->use_cpu_solve) {
206: if (row_identity && col_identity) {
207: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
208: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
209: } else {
210: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
211: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
212: }
213: }
214: B->ops->matsolve = NULL;
215: B->ops->matsolvetranspose = NULL;
217: /* get the triangular factors */
218: if (!cusparsestruct->use_cpu_solve) MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
219: return 0;
220: }
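/*
  Example (illustrative sketch, not from the original file): the explicit factorization sequence
  that reaches MatLUFactorNumeric_SeqAIJCUSPARSE above when not going through a preconditioner.
  The ordering type, the helper name, and the vectors b and x are placeholders.
*/
static PetscErrorCode ExampleLUSolve(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            rowperm, colperm;
  MatFactorInfo info;

  PetscCall(MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F));
  PetscCall(MatGetOrdering(A, MATORDERINGND, &rowperm, &colperm));
  PetscCall(MatFactorInfoInitialize(&info));
  PetscCall(MatLUFactorSymbolic(F, A, rowperm, colperm, &info));
  PetscCall(MatLUFactorNumeric(F, A, &info)); /* dispatches to MatLUFactorNumeric_SeqAIJCUSPARSE */
  PetscCall(MatSolve(F, b, x));               /* GPU triangular solves unless use_cpu_solve was set */
  PetscCall(ISDestroy(&rowperm));
  PetscCall(ISDestroy(&colperm));
  PetscCall(MatDestroy(&F));
  return 0;
}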
222: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
223: {
224: MatCUSPARSEStorageFormat format;
225: PetscBool flg;
226: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
228: PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
229: if (A->factortype == MAT_FACTOR_NONE) {
230: PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg);
231: if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format);
233: PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg);
234: if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format);
235: PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg);
236: if (flg) MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve);
237: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
238: PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg);
239: /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
240: #if CUSPARSE_VERSION > 11301
242: #else
244: #endif
245: PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg);
248: PetscCall(
249: PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
251: #endif
252: }
253: PetscOptionsHeadEnd();
254: return 0;
255: }
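/*
  Example (illustrative sketch, not from the original file): the options registered above are
  normally supplied on the command line and are picked up when MatSetFromOptions() is called on an
  aijcusparse matrix, e.g. (executable name and values are placeholders)

    ./app -mat_type aijcusparse -mat_cusparse_storage_format ell \
          -mat_cusparse_use_cpu_solve -mat_cusparse_spmv_alg csrmv_alg2

  Enum option values are the strings listed in MatCUSPARSEStorageFormats[] and
  MatCUSPARSESpMVAlgorithms[] near the top of this file.
*/
static PetscErrorCode ExampleApplyOptions(Mat A)
{
  PetscCall(MatSetFromOptions(A)); /* invokes MatSetFromOptions_SeqAIJCUSPARSE for this type */
  return 0;
}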
257: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
258: {
259: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
260: PetscInt n = A->rmap->n;
261: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
262: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
263: const PetscInt *ai = a->i, *aj = a->j, *vi;
264: const MatScalar *aa = a->a, *v;
265: PetscInt *AiLo, *AjLo;
266: PetscInt i, nz, nzLower, offset, rowOffset;
268: if (!n) return 0;
269: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
270: try {
271: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
272: nzLower = n + ai[n] - ai[1];
273: if (!loTriFactor) {
274: PetscScalar *AALo;
276: cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar));
278: /* Allocate Space for the lower triangular matrix */
279: cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt));
280: cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt));
282: /* Fill the lower triangular matrix */
283: AiLo[0] = (PetscInt)0;
284: AiLo[n] = nzLower;
285: AjLo[0] = (PetscInt)0;
286: AALo[0] = (MatScalar)1.0;
287: v = aa;
288: vi = aj;
289: offset = 1;
290: rowOffset = 1;
291: for (i = 1; i < n; i++) {
292: nz = ai[i + 1] - ai[i];
293: /* additional 1 for the term on the diagonal */
294: AiLo[i] = rowOffset;
295: rowOffset += nz + 1;
297: PetscArraycpy(&(AjLo[offset]), vi, nz);
298: PetscArraycpy(&(AALo[offset]), v, nz);
300: offset += nz;
301: AjLo[offset] = (PetscInt)i;
302: AALo[offset] = (MatScalar)1.0;
303: offset += 1;
305: v += nz;
306: vi += nz;
307: }
309: /* allocate space for the triangular factor information */
310: PetscNew(&loTriFactor);
311: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
312: /* Create the matrix description */
313: cusparseCreateMatDescr(&loTriFactor->descr);
314: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
315: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
316: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
317: #else
318: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
319: #endif
320: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);
321: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
323: /* set the operation */
324: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
326: /* set the matrix */
327: loTriFactor->csrMat = new CsrMatrix;
328: loTriFactor->csrMat->num_rows = n;
329: loTriFactor->csrMat->num_cols = n;
330: loTriFactor->csrMat->num_entries = nzLower;
332: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
333: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
335: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
336: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
338: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
339: loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
341: /* Create the solve analysis information */
342: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
343: cusparseCreateCsrsvInfo(&loTriFactor->solveInfo);
344: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
345: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
346: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
347: cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);
348: #endif
350: /* perform the solve analysis */
351: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
352: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
353: WaitForCUDA();
354: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
356: /* assign the pointer */
357: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
358: loTriFactor->AA_h = AALo;
359: cudaFreeHost(AiLo);
360: cudaFreeHost(AjLo);
361: PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar));
362: } else { /* update values only */
363: if (!loTriFactor->AA_h) cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar));
364: /* Fill the lower triangular matrix */
365: loTriFactor->AA_h[0] = 1.0;
366: v = aa;
367: vi = aj;
368: offset = 1;
369: for (i = 1; i < n; i++) {
370: nz = ai[i + 1] - ai[i];
371: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
372: offset += nz;
373: loTriFactor->AA_h[offset] = 1.0;
374: offset += 1;
375: v += nz;
376: }
377: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
378: PetscLogCpuToGpu(nzLower * sizeof(PetscScalar));
379: }
380: } catch (char *ex) {
381: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
382: }
383: }
384: return 0;
385: }
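/*
  Example (illustrative sketch, not from the original file): the core CSR manipulation performed
  above, reduced to its essentials. The factored SeqAIJ matrix stores the strictly lower triangular
  part of L row by row (the unit diagonal is implicit, as the loop above shows), and the routine
  rebuilds an explicit CSR with a 1.0 appended at the end of every row so cuSPARSE can run a
  standard triangular solve. The input arrays (si, sj, sa) describing a strictly lower triangular
  CSR matrix and the function name are placeholders; the output arrays must provide room for
  nnz(strict) + n entries.
*/
static void ExampleAppendUnitDiagonal(int n, const int *si, const int *sj, const double *sa, int *li, int *lj, double *la)
{
  int off = 0;

  li[0] = 0;
  for (int i = 0; i < n; i++) {
    for (int k = si[i]; k < si[i + 1]; k++) { /* copy the strictly lower entries of row i */
      lj[off] = sj[k];
      la[off] = sa[k];
      off++;
    }
    lj[off] = i; /* append the unit diagonal entry */
    la[off] = 1.0;
    off++;
    li[i + 1] = off;
  }
}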
387: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
388: {
389: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
390: PetscInt n = A->rmap->n;
391: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
392: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
393: const PetscInt *aj = a->j, *adiag = a->diag, *vi;
394: const MatScalar *aa = a->a, *v;
395: PetscInt *AiUp, *AjUp;
396: PetscInt i, nz, nzUpper, offset;
398: if (!n) return 0;
399: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
400: try {
401: /* next, figure out the number of nonzeros in the upper triangular matrix. */
402: nzUpper = adiag[0] - adiag[n];
403: if (!upTriFactor) {
404: PetscScalar *AAUp;
406: cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar));
408: /* Allocate Space for the upper triangular matrix */
409: cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt));
410: cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt));
412: /* Fill the upper triangular matrix */
413: AiUp[0] = (PetscInt)0;
414: AiUp[n] = nzUpper;
415: offset = nzUpper;
416: for (i = n - 1; i >= 0; i--) {
417: v = aa + adiag[i + 1] + 1;
418: vi = aj + adiag[i + 1] + 1;
420: /* number of elements NOT on the diagonal */
421: nz = adiag[i] - adiag[i + 1] - 1;
423: /* decrement the offset */
424: offset -= (nz + 1);
426: /* first, set the diagonal elements */
427: AjUp[offset] = (PetscInt)i;
428: AAUp[offset] = (MatScalar)1. / v[nz];
429: AiUp[i] = AiUp[i + 1] - (nz + 1);
431: PetscArraycpy(&(AjUp[offset + 1]), vi, nz);
432: PetscArraycpy(&(AAUp[offset + 1]), v, nz);
433: }
435: /* allocate space for the triangular factor information */
436: PetscNew(&upTriFactor);
437: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
439: /* Create the matrix description */
440: cusparseCreateMatDescr(&upTriFactor->descr);
441: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
442: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
443: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
444: #else
445: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
446: #endif
447: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
448: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
450: /* set the operation */
451: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
453: /* set the matrix */
454: upTriFactor->csrMat = new CsrMatrix;
455: upTriFactor->csrMat->num_rows = n;
456: upTriFactor->csrMat->num_cols = n;
457: upTriFactor->csrMat->num_entries = nzUpper;
459: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
460: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
462: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
463: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
465: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
466: upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
468: /* Create the solve analysis information */
469: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
470: cusparseCreateCsrsvInfo(&upTriFactor->solveInfo);
471: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
472: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
473: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
474: cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);
475: #endif
477: /* perform the solve analysis */
478: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
479: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
481: WaitForCUDA();
482: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
484: /* assign the pointer */
485: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
486: upTriFactor->AA_h = AAUp;
487: cudaFreeHost(AiUp);
488: cudaFreeHost(AjUp);
489: PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar));
490: } else {
491: if (!upTriFactor->AA_h) cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar));
492: /* Fill the upper triangular matrix */
493: offset = nzUpper;
494: for (i = n - 1; i >= 0; i--) {
495: v = aa + adiag[i + 1] + 1;
497: /* number of elements NOT on the diagonal */
498: nz = adiag[i] - adiag[i + 1] - 1;
500: /* decrement the offset */
501: offset -= (nz + 1);
503: /* first, set the diagonal elements */
504: upTriFactor->AA_h[offset] = 1. / v[nz];
505: PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz);
506: }
507: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
508: PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar));
509: }
510: } catch (char *ex) {
511: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
512: }
513: }
514: return 0;
515: }
517: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
518: {
519: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
520: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
521: IS isrow = a->row, iscol = a->icol;
522: PetscBool row_identity, col_identity;
523: PetscInt n = A->rmap->n;
526: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
527: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
529: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
530: cusparseTriFactors->nnz = a->nz;
532: A->offloadmask = PETSC_OFFLOAD_BOTH;
533: /* lower triangular indices */
534: ISIdentity(isrow, &row_identity);
535: if (!row_identity && !cusparseTriFactors->rpermIndices) {
536: const PetscInt *r;
538: ISGetIndices(isrow, &r);
539: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
540: cusparseTriFactors->rpermIndices->assign(r, r + n);
541: ISRestoreIndices(isrow, &r);
542: PetscLogCpuToGpu(n * sizeof(PetscInt));
543: }
545: /* upper triangular indices */
546: ISIdentity(iscol, &col_identity);
547: if (!col_identity && !cusparseTriFactors->cpermIndices) {
548: const PetscInt *c;
550: ISGetIndices(iscol, &c);
551: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
552: cusparseTriFactors->cpermIndices->assign(c, c + n);
553: ISRestoreIndices(iscol, &c);
554: PetscLogCpuToGpu(n * sizeof(PetscInt));
555: }
556: return 0;
557: }
559: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
560: {
561: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
562: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
563: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
564: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
565: PetscInt *AiUp, *AjUp;
566: PetscScalar *AAUp;
567: PetscScalar *AALo;
568: PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
569: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data;
570: const PetscInt *ai = b->i, *aj = b->j, *vj;
571: const MatScalar *aa = b->a, *v;
573: if (!n) return 0;
574: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
575: try {
576: cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar));
577: cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar));
578: if (!upTriFactor && !loTriFactor) {
579: /* Allocate Space for the upper triangular matrix */
580: cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt));
581: cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt));
583: /* Fill the upper triangular matrix */
584: AiUp[0] = (PetscInt)0;
585: AiUp[n] = nzUpper;
586: offset = 0;
587: for (i = 0; i < n; i++) {
588: /* set the pointers */
589: v = aa + ai[i];
590: vj = aj + ai[i];
591: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
593: /* first, set the diagonal elements */
594: AjUp[offset] = (PetscInt)i;
595: AAUp[offset] = (MatScalar)1.0 / v[nz];
596: AiUp[i] = offset;
597: AALo[offset] = (MatScalar)1.0 / v[nz];
599: offset += 1;
600: if (nz > 0) {
601: PetscArraycpy(&(AjUp[offset]), vj, nz);
602: PetscArraycpy(&(AAUp[offset]), v, nz);
603: for (j = offset; j < offset + nz; j++) {
604: AAUp[j] = -AAUp[j];
605: AALo[j] = AAUp[j] / v[nz];
606: }
607: offset += nz;
608: }
609: }
611: /* allocate space for the triangular factor information */
612: PetscNew(&upTriFactor);
613: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
615: /* Create the matrix description */
616: cusparseCreateMatDescr(&upTriFactor->descr);
617: cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
618: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
619: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
620: #else
621: cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
622: #endif
623: cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
624: cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);
626: /* set the matrix */
627: upTriFactor->csrMat = new CsrMatrix;
628: upTriFactor->csrMat->num_rows = A->rmap->n;
629: upTriFactor->csrMat->num_cols = A->cmap->n;
630: upTriFactor->csrMat->num_entries = a->nz;
632: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
633: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
635: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
636: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
638: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
639: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
641: /* set the operation */
642: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
644: /* Create the solve analysis information */
645: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
646: cusparseCreateCsrsvInfo(&upTriFactor->solveInfo);
647: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
648: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
649: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
650: cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize);
651: #endif
653: /* perform the solve analysis */
654: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
655: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
657: WaitForCUDA();
658: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
660: /* assign the pointer */
661: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
663: /* allocate space for the triangular factor information */
664: PetscNew(&loTriFactor);
665: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
667: /* Create the matrix description */
668: cusparseCreateMatDescr(&loTriFactor->descr);
669: cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);
670: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
671: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
672: #else
673: cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
674: #endif
675: cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);
676: cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
678: /* set the operation */
679: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
681: /* set the matrix */
682: loTriFactor->csrMat = new CsrMatrix;
683: loTriFactor->csrMat->num_rows = A->rmap->n;
684: loTriFactor->csrMat->num_cols = A->cmap->n;
685: loTriFactor->csrMat->num_entries = a->nz;
687: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
688: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
690: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
691: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
693: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
694: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
696: /* Create the solve analysis information */
697: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
698: cusparseCreateCsrsvInfo(&loTriFactor->solveInfo);
699: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
700: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
701: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
702: cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize);
703: #endif
705: /* perform the solve analysis */
706: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
707: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
709: WaitForCUDA();
710: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
712: /* assign the pointer */
713: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
715: PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)));
716: cudaFreeHost(AiUp);
717: cudaFreeHost(AjUp);
718: } else {
719: /* Fill the upper triangular matrix */
720: offset = 0;
721: for (i = 0; i < n; i++) {
722: /* set the pointers */
723: v = aa + ai[i];
724: nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
726: /* first, set the diagonal elements */
727: AAUp[offset] = 1.0 / v[nz];
728: AALo[offset] = 1.0 / v[nz];
730: offset += 1;
731: if (nz > 0) {
732: PetscArraycpy(&(AAUp[offset]), v, nz);
733: for (j = offset; j < offset + nz; j++) {
734: AAUp[j] = -AAUp[j];
735: AALo[j] = AAUp[j] / v[nz];
736: }
737: offset += nz;
738: }
739: }
742: upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
743: loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
744: PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar));
745: }
746: cudaFreeHost(AAUp);
747: cudaFreeHost(AALo);
748: } catch (char *ex) {
749: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
750: }
751: }
752: return 0;
753: }
755: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
756: {
757: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
758: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
759: IS ip = a->row;
760: PetscBool perm_identity;
761: PetscInt n = A->rmap->n;
764: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
765: if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
766: cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
768: A->offloadmask = PETSC_OFFLOAD_BOTH;
770: /* lower triangular indices */
771: ISIdentity(ip, &perm_identity);
772: if (!perm_identity) {
773: IS iip;
774: const PetscInt *irip, *rip;
776: ISInvertPermutation(ip, PETSC_DECIDE, &iip);
777: ISGetIndices(iip, &irip);
778: ISGetIndices(ip, &rip);
779: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
780: cusparseTriFactors->rpermIndices->assign(rip, rip + n);
781: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
782: cusparseTriFactors->cpermIndices->assign(irip, irip + n);
783: ISRestoreIndices(iip, &irip);
784: ISDestroy(&iip);
785: ISRestoreIndices(ip, &rip);
786: PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
787: }
788: return 0;
789: }
791: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
792: {
793: Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
794: IS ip = b->row;
795: PetscBool perm_identity;
797: MatSeqAIJCUSPARSECopyFromGPU(A);
798: MatCholeskyFactorNumeric_SeqAIJ(B, A, info);
799: B->offloadmask = PETSC_OFFLOAD_CPU;
800: /* determine which version of MatSolve needs to be used. */
801: ISIdentity(ip, &perm_identity);
802: if (perm_identity) {
803: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
804: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
805: B->ops->matsolve = NULL;
806: B->ops->matsolvetranspose = NULL;
807: } else {
808: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
809: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
810: B->ops->matsolve = NULL;
811: B->ops->matsolvetranspose = NULL;
812: }
814: /* get the triangular factors */
815: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
816: return 0;
817: }
819: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
820: {
821: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
822: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
823: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
824: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
825: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
826: cusparseIndexBase_t indexBase;
827: cusparseMatrixType_t matrixType;
828: cusparseFillMode_t fillMode;
829: cusparseDiagType_t diagType;
831: /* allocate space for the transpose of the lower triangular factor */
832: PetscNew(&loTriFactorT);
833: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
835: /* set the matrix descriptors of the lower triangular factor */
836: matrixType = cusparseGetMatType(loTriFactor->descr);
837: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
838: fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
839: diagType = cusparseGetMatDiagType(loTriFactor->descr);
841: /* Create the matrix description */
842: cusparseCreateMatDescr(&loTriFactorT->descr);
843: cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);
844: cusparseSetMatType(loTriFactorT->descr, matrixType);
845: cusparseSetMatFillMode(loTriFactorT->descr, fillMode);
846: cusparseSetMatDiagType(loTriFactorT->descr, diagType);
848: /* set the operation */
849: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
851: /* allocate GPU space for the CSC of the lower triangular factor*/
852: loTriFactorT->csrMat = new CsrMatrix;
853: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
854: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
855: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
856: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
857: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
858: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
860: /* compute the transpose of the lower triangular factor, i.e. the CSC */
861: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
862: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
863: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
864: loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
865: cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize);
866: #endif
868: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
869: {
870: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
871: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872: loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
873: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
874: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
875: #else
876: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
877: #endif
879: PetscCallCUSPARSE(stat);
879: }
881: WaitForCUDA();
882: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
884: /* Create the solve analysis information */
885: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
886: cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo);
887: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
888: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
889: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
890: cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize);
891: #endif
893: /* perform the solve analysis */
894: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
895: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
897: WaitForCUDA();
898: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
900: /* assign the pointer */
901: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
903: /*********************************************/
904: /* Now the Transpose of the Upper Tri Factor */
905: /*********************************************/
907: /* allocate space for the transpose of the upper triangular factor */
908: PetscNew(&upTriFactorT);
909: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
911: /* set the matrix descriptors of the upper triangular factor */
912: matrixType = cusparseGetMatType(upTriFactor->descr);
913: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
914: fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
915: diagType = cusparseGetMatDiagType(upTriFactor->descr);
917: /* Create the matrix description */
918: cusparseCreateMatDescr(&upTriFactorT->descr);
919: cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);
920: cusparseSetMatType(upTriFactorT->descr, matrixType);
921: cusparseSetMatFillMode(upTriFactorT->descr, fillMode);
922: cusparseSetMatDiagType(upTriFactorT->descr, diagType);
924: /* set the operation */
925: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
927: /* allocate GPU space for the CSC of the upper triangular factor*/
928: upTriFactorT->csrMat = new CsrMatrix;
929: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
930: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
931: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
932: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
933: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
934: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
936: /* compute the transpose of the upper triangular factor, i.e. the CSC */
937: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
938: PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
939: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
940: upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
941: cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize);
942: #endif
944: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
945: {
946: // there is no clean way to have PetscCallCUSPARSE wrapping this function...
947: auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
948: upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
949: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
950: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
951: #else
952: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
953: #endif
954: PetscCallCUSPARSE(stat);
955: }
957: WaitForCUDA();
958: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
960: /* Create the solve analysis information */
961: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
962: cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo);
963: #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
964: PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
965: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
966: cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize);
967: #endif
969: /* perform the solve analysis */
970: /* christ, would it have killed you to put this stuff in a function????????? */
971: PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
972: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
974: WaitForCUDA();
975: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0);
977: /* assign the pointer */
978: ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
979: return 0;
980: }
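/*
  Example (illustrative sketch, not from the original file): what the csr2csc conversions above
  compute, written as a plain host-side routine. For an m x n CSR matrix (ai, aj, aa) it produces
  the CSC arrays (bi, bj, ba), i.e. the CSR representation of the transpose, via a counting pass
  over the column indices; cusparseCsr2cscEx2()/cusparse_csr2csc do the same work on the GPU.
  All names are placeholders.
*/
static void ExampleCsrToCsc(int m, int n, int nnz, const int *ai, const int *aj, const double *aa, int *bi, int *bj, double *ba)
{
  for (int j = 0; j <= n; j++) bi[j] = 0;
  for (int k = 0; k < nnz; k++) bi[aj[k] + 1]++;  /* count the entries in each column */
  for (int j = 0; j < n; j++) bi[j + 1] += bi[j]; /* prefix sum -> column start offsets */
  for (int i = 0; i < m; i++) {                   /* scatter row by row */
    for (int k = ai[i]; k < ai[i + 1]; k++) {
      const int dest = bi[aj[k]]++;
      bj[dest] = i;
      ba[dest] = aa[k];
    }
  }
  for (int j = n; j > 0; j--) bi[j] = bi[j - 1]; /* restore the start offsets shifted by the scatter */
  bi[0] = 0;
}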
982: struct PetscScalarToPetscInt {
983: __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
984: };
986: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
987: {
988: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
989: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
990: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
991: cusparseStatus_t stat;
992: cusparseIndexBase_t indexBase;
994: MatSeqAIJCUSPARSECopyToGPU(A);
995: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
997: matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
999: if (A->transupdated) return 0;
1000: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
1001: PetscLogGpuTimeBegin();
1002: if (cusparsestruct->format != MAT_CUSPARSE_CSR) MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
1003: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1004: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1005: cusparseCreateMatDescr(&matstructT->descr);
1006: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1007: cusparseSetMatIndexBase(matstructT->descr, indexBase);
1008: cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
1010: /* set alpha and beta */
1011: cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));
1012: cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar));
1013: cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));
1014: cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
1015: cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
1016: cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
1018: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1019: CsrMatrix *matrixT = new CsrMatrix;
1020: matstructT->mat = matrixT;
1021: matrixT->num_rows = A->cmap->n;
1022: matrixT->num_cols = A->rmap->n;
1023: matrixT->num_entries = a->nz;
1024: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1025: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1026: matrixT->values = new THRUSTARRAY(a->nz);
1028: if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1029: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1031: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1032: #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
1033: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1034: indexBase, cusparse_scalartype);
1035: PetscCallCUSPARSE(stat);
1036: #else
1037: /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1038: see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1040: I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1041: it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1042: when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1043: */
1044: if (matrixT->num_entries) {
1045: stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
1046: PetscCallCUSPARSE(stat);
1048: } else {
1049: matstructT->matDescr = NULL;
1050: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1051: }
1052: #endif
1053: #endif
1054: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1055: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1056: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1057: #else
1058: CsrMatrix *temp = new CsrMatrix;
1059: CsrMatrix *tempT = new CsrMatrix;
1060: /* First convert HYB to CSR */
1061: temp->num_rows = A->rmap->n;
1062: temp->num_cols = A->cmap->n;
1063: temp->num_entries = a->nz;
1064: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1065: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1066: temp->values = new THRUSTARRAY(a->nz);
1068: stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
1069: PetscCallCUSPARSE(stat);
1071: /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1072: tempT->num_rows = A->rmap->n;
1073: tempT->num_cols = A->cmap->n;
1074: tempT->num_entries = a->nz;
1075: tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1076: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1077: tempT->values = new THRUSTARRAY(a->nz);
1079: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
1080: tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1081: PetscCallCUSPARSE(stat);
1083: /* Last, convert CSC to HYB */
1084: cusparseHybMat_t hybMat;
1085: cusparseCreateHybMat(&hybMat);
1086: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1087: stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
1088: PetscCallCUSPARSE(stat);
1090: /* assign the pointer */
1091: matstructT->mat = hybMat;
1092: A->transupdated = PETSC_TRUE;
1093: /* delete temporaries */
1094: if (tempT) {
1095: if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1096: if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1097: if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1098: delete (CsrMatrix *)tempT;
1099: }
1100: if (temp) {
1101: if (temp->values) delete (THRUSTARRAY *)temp->values;
1102: if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1103: if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1104: delete (CsrMatrix *)temp;
1105: }
1106: #endif
1107: }
1108: }
1109: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1110: CsrMatrix *matrix = (CsrMatrix *)matstruct->mat;
1111: CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
1120: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1121: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1122: cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1123: PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
1124: }
1125: if (!cusparsestruct->csr2csc_i) {
1126: THRUSTARRAY csr2csc_a(matrix->num_entries);
1127: thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0);
1129: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1130: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1131: void *csr2cscBuffer;
1132: size_t csr2cscBufferSize;
1133: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1134: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
1135: PetscCallCUSPARSE(stat);
1136: cudaMalloc(&csr2cscBuffer, csr2cscBufferSize);
1137: #endif
1139: if (matrix->num_entries) {
1140: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1141: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1142: I checked every parameter and they were all just fine. I have no clue why cusparse complains.
1144: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1145: should be filled with indexBase. So I just take a shortcut here.
1146: */
1147: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1148: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1149: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
1150: PetscCallCUSPARSE(stat);
1151: #else
1152: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1153: PetscCallCUSPARSE(stat);
1154: #endif
1155: } else {
1156: matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
1157: }
1159: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1160: thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt());
1161: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1162: cudaFree(csr2cscBuffer);
1163: #endif
1164: }
1165: PetscCallThrust(
1166: thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1167: }
1168: PetscLogGpuTimeEnd();
1169: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0);
1170: /* the compressed row indices are not used for matTranspose */
1171: matstructT->cprowIndices = NULL;
1172: /* assign the pointer */
1173: ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
1174: A->transupdated = PETSC_TRUE;
1175: return 0;
1176: }
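/*
  Example (illustrative sketch, not from the original file): the permutation-caching trick used by
  MatSeqAIJCUSPARSEFormExplicitTranspose() above. The values 0,1,2,... are pushed through csr2csc
  once, so the transposed values encode (as floating-point numbers) where each CSR entry lands in
  CSC order; a functor like PetscScalarToPetscInt converts them back to integer indices that are
  cached in csr2csc_i and reused to update the transpose with a simple gather. The sketch uses
  double instead of PetscScalar (for complex scalars the real part is taken, as in the functor
  above); all names are placeholders.
*/
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct ExampleScalarToInt {
  __host__ __device__ int operator()(double s) const { return (int)s; }
};

static void ExampleRecordCsr2CscPermutation(const thrust::device_vector<double> &permuted_sequence, thrust::device_vector<int> &perm)
{
  /* permuted_sequence holds 0,1,2,... after being run through csr2csc; recover integer indices */
  perm.resize(permuted_sequence.size());
  thrust::transform(permuted_sequence.begin(), permuted_sequence.end(), perm.begin(), ExampleScalarToInt());
}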
1178: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1179: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1180: {
1181: PetscInt n = xx->map->n;
1182: const PetscScalar *barray;
1183: PetscScalar *xarray;
1184: thrust::device_ptr<const PetscScalar> bGPU;
1185: thrust::device_ptr<PetscScalar> xGPU;
1186: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1187: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1188: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1189: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
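/* Summary of this routine: the transposed system is solved by applying the factors in reverse order,
   i.e. a U^T solve followed by an L^T solve, using the transposed factor structures created by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(); the row permutation is applied to b before the solves
   and the column permutation to the result afterwards. */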
1191: /* Analyze the matrix and create the transpose ... on the fly */
1192: if (!loTriFactorT && !upTriFactorT) {
1193: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1194: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1195: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1196: }
1198: /* Get the GPU pointers */
1199: VecCUDAGetArrayWrite(xx, &xarray);
1200: VecCUDAGetArrayRead(bb, &barray);
1201: xGPU = thrust::device_pointer_cast(xarray);
1202: bGPU = thrust::device_pointer_cast(barray);
1204: PetscLogGpuTimeBegin();
1205: /* First, reorder with the row permutation */
1206: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1208: /* First, solve U */
1209: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1210: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1212: /* Then, solve L */
1213: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1214: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1216: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1217: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1219: /* Copy the temporary to the full solution. */
1220: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1222: /* restore */
1223: VecCUDARestoreArrayRead(bb, &barray);
1224: VecCUDARestoreArrayWrite(xx, &xarray);
1225: PetscLogGpuTimeEnd();
1226: PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1227: return 0;
1228: }
1230: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1231: {
1232: const PetscScalar *barray;
1233: PetscScalar *xarray;
1234: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1235: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1236: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1237: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1239: /* Analyze the matrix and create the transpose ... on the fly */
1240: if (!loTriFactorT && !upTriFactorT) {
1241: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1242: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1243: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1244: }
1246: /* Get the GPU pointers */
1247: VecCUDAGetArrayWrite(xx, &xarray);
1248: VecCUDAGetArrayRead(bb, &barray);
1250: PetscLogGpuTimeBegin();
1251: /* First, solve U */
1252: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
1253: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1255: /* Then, solve L */
1256: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
1257: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1259: /* restore */
1260: VecCUDARestoreArrayRead(bb, &barray);
1261: VecCUDARestoreArrayWrite(xx, &xarray);
1262: PetscLogGpuTimeEnd();
1263: PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1264: return 0;
1265: }
1267: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1268: {
1269: const PetscScalar *barray;
1270: PetscScalar *xarray;
1271: thrust::device_ptr<const PetscScalar> bGPU;
1272: thrust::device_ptr<PetscScalar> xGPU;
1273: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1274: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1275: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1276: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
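/* Summary of the overall computation below:
     temp = b[rperm];  x = L^{-1} temp;  temp = U^{-1} x;  x = temp[cperm];
   i.e. the row/column permutations from the factorization are applied before and after the two
   triangular solves. */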
1278: /* Get the GPU pointers */
1279: VecCUDAGetArrayWrite(xx, &xarray);
1280: VecCUDAGetArrayRead(bb, &barray);
1281: xGPU = thrust::device_pointer_cast(xarray);
1282: bGPU = thrust::device_pointer_cast(barray);
1284: PetscLogGpuTimeBegin();
1285: /* First, reorder with the row permutation */
1286: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1288: /* Next, solve L */
1289: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1290: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1292: /* Then, solve U */
1293: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1294: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1296: /* Last, reorder with the column permutation */
1297: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
1299: VecCUDARestoreArrayRead(bb, &barray);
1300: VecCUDARestoreArrayWrite(xx, &xarray);
1301: PetscLogGpuTimeEnd();
1302: PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1303: return 0;
1304: }
1306: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1307: {
1308: const PetscScalar *barray;
1309: PetscScalar *xarray;
1310: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1311: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1312: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1313: THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector;
1315: /* Get the GPU pointers */
1316: VecCUDAGetArrayWrite(xx, &xarray);
1317: VecCUDAGetArrayRead(bb, &barray);
1319: PetscLogGpuTimeBegin();
1320: /* First, solve L */
1321: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
1322: loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1324: /* Next, solve U */
1325: PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
1326: upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1328: VecCUDARestoreArrayRead(bb, &barray);
1329: VecCUDARestoreArrayWrite(xx, &xarray);
1330: PetscLogGpuTimeEnd();
1331: PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n);
1332: return 0;
1333: }
1335: #if CUSPARSE_VERSION >= 11500
1336: /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1337: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1338: {
1339: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1340: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1341: const PetscScalar *barray;
1342: PetscScalar *xarray;
1344: VecCUDAGetArrayWrite(x, &xarray);
1345: VecCUDAGetArrayRead(b, &barray);
1346: PetscLogGpuTimeBegin();
1348: /* Solve L*y = b */
1349: cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1350: cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1351: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1352: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
1353: fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1355: /* Solve U*x = y */
1356: cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1357: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
1358: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1360: VecCUDARestoreArrayRead(b, &barray);
1361: VecCUDARestoreArrayWrite(x, &xarray);
1363: PetscLogGpuTimeEnd();
1364: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1365: return 0;
1366: }
1368: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1369: {
1370: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1371: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1372: const PetscScalar *barray;
1373: PetscScalar *xarray;
1375: if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1376: cusparseSpSV_createDescr(&fs->spsvDescr_Lt);
1377: PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1378: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1380: cusparseSpSV_createDescr(&fs->spsvDescr_Ut);
1381: cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut);
1382: cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1383: cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut);
1384: fs->createdTransposeSpSVDescr = PETSC_TRUE;
1385: }
1387: if (!fs->updatedTransposeSpSVAnalysis) {
1388: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);
1390: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut);
1391: fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1392: }
1394: VecCUDAGetArrayWrite(x, &xarray);
1395: VecCUDAGetArrayRead(b, &barray);
1396: PetscLogGpuTimeBegin();
1398: /* Solve Ut*y = b */
1399: cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1400: cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1401: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
1402: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1404: /* Solve Lt*x = y */
1405: cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1406: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1407: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1409: VecCUDARestoreArrayRead(b, &barray);
1410: VecCUDARestoreArrayWrite(x, &xarray);
1411: PetscLogGpuTimeEnd();
1412: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1413: return 0;
1414: }
1416: static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
1417: {
1418: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1419: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1420: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1421: CsrMatrix *Acsr;
1422: PetscInt m, nz;
1423: PetscBool flg;
1425: if (PetscDefined(USE_DEBUG)) {
1426: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1428: }
1430: /* Copy A's value to fact */
1431: m = fact->rmap->n;
1432: nz = aij->nz;
1433: MatSeqAIJCUSPARSECopyToGPU(A);
1434: Acsr = (CsrMatrix *)Acusp->mat->mat;
1435: cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1437: /* Factorize fact inplace */
1438: if (m)
1439: PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1440: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1441: if (PetscDefined(USE_DEBUG)) {
1442: int numerical_zero;
1443: cusparseStatus_t status;
1444: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1445: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1446: }
1448: /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore, we do it after cusparseXcsrilu02().
1449: See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1450: */
1451: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);
1453: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U);
1455: /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1456: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1458: fact->offloadmask = PETSC_OFFLOAD_GPU;
1459: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0;
1460: fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1461: fact->ops->matsolve = NULL;
1462: fact->ops->matsolvetranspose = NULL;
1463: PetscLogGpuFlops(fs->numericFactFlops);
1464: return 0;
1465: }
1467: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1468: {
1469: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1470: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1471: PetscInt m, nz;
1473: if (PetscDefined(USE_DEBUG)) {
1474: PetscInt i;
1475: PetscBool flg, missing;
1477: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1480: MatMissingDiagonal(A, &missing, &i);
1482: }
1484: /* Free the old stale stuff */
1485: MatSeqAIJCUSPARSETriFactors_Reset(&fs);
1487: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
1488: but they will not be used; we allocate them just for easier debugging.
1489: */
1490: MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);
1492: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1493: fact->factortype = MAT_FACTOR_ILU;
1494: fact->info.factor_mallocs = 0;
1495: fact->info.fill_ratio_given = info->fill;
1496: fact->info.fill_ratio_needed = 1.0;
1498: aij->row = NULL;
1499: aij->col = NULL;
1501: /* ====================================================================== */
1502: /* Copy A's i, j to fact and also allocate the value array of fact. */
1503: /* We'll do in-place factorization on fact */
1504: /* ====================================================================== */
1505: const int *Ai, *Aj;
1507: m = fact->rmap->n;
1508: nz = aij->nz;
1510: cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1511: cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1512: cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1513: MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1514: cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1515: cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1517: /* ====================================================================== */
1518: /* Create descriptors for M, L, U */
1519: /* ====================================================================== */
1520: cusparseFillMode_t fillMode;
1521: cusparseDiagType_t diagType;
1523: cusparseCreateMatDescr(&fs->matDescr_M);
1524: cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO);
1525: cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
1527: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1528: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1529: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1530: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1531: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1532: */
1533: fillMode = CUSPARSE_FILL_MODE_LOWER;
1534: diagType = CUSPARSE_DIAG_TYPE_UNIT;
1535: cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1536: cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1537: cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
1539: fillMode = CUSPARSE_FILL_MODE_UPPER;
1540: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1541: cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1542: cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1543: cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
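/* Note: spMatDescr_L and spMatDescr_U above are two views of the same in-place factored CSR arrays
   (csrRowPtr/csrColIdx/csrVal); the FILL_MODE and DIAG_TYPE attributes tell cuSPARSE which triangle
   to read and that L's diagonal is implicitly unit. */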
1545: /* ========================================================================= */
1546: /* Query buffer sizes for csrilu0, SpSV and allocate buffers */
1547: /* ========================================================================= */
1548: cusparseCreateCsrilu02Info(&fs->ilu0Info_M);
1549: if (m)
1550: PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1551: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1553: cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1554: cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);
1556: cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype);
1557: cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype);
1559: cusparseSpSV_createDescr(&fs->spsvDescr_L);
1560: cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);
1562: cusparseSpSV_createDescr(&fs->spsvDescr_U);
1563: cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U);
1565: /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1566: and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1567: spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of spsvBuffer_L/U.
1568: To save memory, we share factBuffer_M with the bigger of spsvBuffer_L/U.
1569: */
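/* Illustrative example (made-up sizes): if spsvBufferSize_L = 8 MB, spsvBufferSize_U = 6 MB and
   factBufferSize_M = 7 MB, then factBuffer_M is allocated with max(8, 7) = 8 MB and aliased as
   spsvBuffer_L, while spsvBuffer_U gets its own 6 MB allocation. */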
1570: if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1571: cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1572: fs->spsvBuffer_L = fs->factBuffer_M;
1573: cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U);
1574: } else {
1575: cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M));
1576: fs->spsvBuffer_U = fs->factBuffer_M;
1577: cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1578: }
1580: /* ========================================================================== */
1581: /* Perform analysis of ilu0 on M, SpSv on L and U */
1582: /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1583: /* ========================================================================== */
1584: int structural_zero;
1585: cusparseStatus_t status;
1587: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1588: if (m)
1589: PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1590: fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1591: if (PetscDefined(USE_DEBUG)) {
1592: /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1593: status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1594: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1595: }
1597: /* Estimate FLOPs of the numeric factorization */
1598: {
1599: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1600: PetscInt *Ai, *Adiag, nzRow, nzLeft;
1601: PetscLogDouble flops = 0.0;
1603: MatMarkDiagonal_SeqAIJ(A);
1604: Ai = Aseq->i;
1605: Adiag = Aseq->diag;
1606: for (PetscInt i = 0; i < m; i++) {
1607: if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros to the left of the diagonal of row i */
1608: nzRow = Ai[i + 1] - Ai[i];
1609: nzLeft = Adiag[i] - Ai[i];
1610: /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume that each time, the nonzeros
1611: to the right of (and including) the eliminated one are updated, which incurs a multiplication and an addition.
1612: */
1613: nzLeft = (nzRow - 1) / 2;
1614: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
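/* For instance (illustrative numbers only): a row with nzRow = 5 and nzLeft = 2 contributes
   2 * (2*5 - 2 + 1) = 18 flops to the estimate. */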
1615: }
1616: }
1617: fs->numericFactFlops = flops;
1618: }
1619: fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1620: return 0;
1621: }
1623: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1624: {
1625: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1626: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1627: const PetscScalar *barray;
1628: PetscScalar *xarray;
1630: VecCUDAGetArrayWrite(x, &xarray);
1631: VecCUDAGetArrayRead(b, &barray);
1632: PetscLogGpuTimeBegin();
1634: /* Solve L*y = b */
1635: cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray);
1636: cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y);
1637: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
1638: fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1640: /* Solve Lt*x = y */
1641: cusparseDnVecSetValues(fs->dnVecDescr_X, xarray);
1642: PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
1643: fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1645: VecCUDARestoreArrayRead(b, &barray);
1646: VecCUDARestoreArrayWrite(x, &xarray);
1648: PetscLogGpuTimeEnd();
1649: PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n);
1650: return 0;
1651: }
1653: static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1654: {
1655: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1656: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1657: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1658: CsrMatrix *Acsr;
1659: PetscInt m, nz;
1660: PetscBool flg;
1662: if (PetscDefined(USE_DEBUG)) {
1663: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1665: }
1667: /* Copy A's value to fact */
1668: m = fact->rmap->n;
1669: nz = aij->nz;
1670: MatSeqAIJCUSPARSECopyToGPU(A);
1671: Acsr = (CsrMatrix *)Acusp->mat->mat;
1672: cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1674: /* Factorize fact inplace */
1675: /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1676: Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1677: The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1678: and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1679: In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1680: */
1681: if (m) cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1682: if (PetscDefined(USE_DEBUG)) {
1683: int numerical_zero;
1684: cusparseStatus_t status;
1685: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1686: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1687: }
1689: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L);
1691: /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1692: ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1693: */
1694: cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt);
1696: fact->offloadmask = PETSC_OFFLOAD_GPU;
1697: fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0;
1698: fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0;
1699: fact->ops->matsolve = NULL;
1700: fact->ops->matsolvetranspose = NULL;
1701: PetscLogGpuFlops(fs->numericFactFlops);
1702: return 0;
1703: }
1705: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1706: {
1707: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1708: Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data;
1709: PetscInt m, nz;
1711: if (PetscDefined(USE_DEBUG)) {
1712: PetscInt i;
1713: PetscBool flg, missing;
1715: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
1718: MatMissingDiagonal(A, &missing, &i);
1720: }
1722: /* Free the old stale stuff */
1723: MatSeqAIJCUSPARSETriFactors_Reset(&fs);
1725: /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on the host,
1726: but they will not be used; we allocate them just for easier debugging.
1727: */
1728: MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/);
1730: fact->offloadmask = PETSC_OFFLOAD_BOTH;
1731: fact->factortype = MAT_FACTOR_ICC;
1732: fact->info.factor_mallocs = 0;
1733: fact->info.fill_ratio_given = info->fill;
1734: fact->info.fill_ratio_needed = 1.0;
1736: aij->row = NULL;
1737: aij->col = NULL;
1739: /* ====================================================================== */
1740: /* Copy A's i, j to fact and also allocate the value array of fact. */
1741: /* We'll do in-place factorization on fact */
1742: /* ====================================================================== */
1743: const int *Ai, *Aj;
1745: m = fact->rmap->n;
1746: nz = aij->nz;
1748: cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1));
1749: cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz);
1750: cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz);
1751: MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj); /* Do not use compressed Ai */
1752: cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1753: cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream);
1755: /* ====================================================================== */
1756: /* Create mat descriptors for M, L */
1757: /* ====================================================================== */
1758: cusparseFillMode_t fillMode;
1759: cusparseDiagType_t diagType;
1761: cusparseCreateMatDescr(&fs->matDescr_M);
1762: cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO);
1763: cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
1765: /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1766: cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1767: assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1768: all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1769: assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1770: */
1771: fillMode = CUSPARSE_FILL_MODE_LOWER;
1772: diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1773: cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
1774: cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode));
1775: cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType));
1777: /* ========================================================================= */
1778: /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */
1779: /* ========================================================================= */
1780: cusparseCreateCsric02Info(&fs->ic0Info_M);
1781: if (m) cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M);
1783: cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m);
1784: cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m);
1786: cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype);
1787: cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype);
1789: cusparseSpSV_createDescr(&fs->spsvDescr_L);
1790: cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L);
1792: cusparseSpSV_createDescr(&fs->spsvDescr_Lt);
1793: cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt);
1795: /* To save device memory, we share the factorization buffer with one of the solve buffers.
1796: See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
1797: */
1798: if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
1799: cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M));
1800: fs->spsvBuffer_L = fs->factBuffer_M;
1801: cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt);
1802: } else {
1803: cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M));
1804: fs->spsvBuffer_Lt = fs->factBuffer_M;
1805: cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L);
1806: }
1808: /* ========================================================================== */
1809: /* Perform analysis of ic0 on M */
1810: /* The lower triangular part of M has the same sparsity pattern as L */
1811: /* ========================================================================== */
1812: int structural_zero;
1813: cusparseStatus_t status;
1815: fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1816: if (m) cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M);
1817: if (PetscDefined(USE_DEBUG)) {
1818: /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1819: status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1820: PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1821: }
1823: /* Estimate FLOPs of the numeric factorization */
1824: {
1825: Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data;
1826: PetscInt *Ai, nzRow, nzLeft;
1827: PetscLogDouble flops = 0.0;
1829: Ai = Aseq->i;
1830: for (PetscInt i = 0; i < m; i++) {
1831: nzRow = Ai[i + 1] - Ai[i];
1832: if (nzRow > 1) {
1833: /* We want to eliminate the nonzeros to the left of the diagonal one by one. Assume that each time, the nonzeros
1834: to the right of (and including) the eliminated one are updated, which incurs a multiplication and an addition.
1835: */
1836: nzLeft = (nzRow - 1) / 2;
1837: flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1838: }
1839: }
1840: fs->numericFactFlops = flops;
1841: }
1842: fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1843: return 0;
1844: }
1845: #endif
1847: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1848: {
1849: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1851: #if CUSPARSE_VERSION >= 11500
1852: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1853: if (cusparseTriFactors->factorizeOnDevice) {
1854: ISIdentity(isrow, &row_identity);
1855: ISIdentity(iscol, &col_identity);
1856: }
1857: if (!info->levels && row_identity && col_identity) {
1858: MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info);
1859: } else
1860: #endif
1861: {
1862: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1863: MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1864: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1865: }
1866: return 0;
1867: }
1869: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1870: {
1871: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1873: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1874: MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info);
1875: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1876: return 0;
1877: }
1879: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1880: {
1881: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1883: #if CUSPARSE_VERSION >= 11500
1884: PetscBool perm_identity = PETSC_FALSE;
1885: if (cusparseTriFactors->factorizeOnDevice) ISIdentity(perm, &perm_identity);
1886: if (!info->levels && perm_identity) {
1887: MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info);
1888: } else
1889: #endif
1890: {
1891: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1892: MatICCFactorSymbolic_SeqAIJ(B, A, perm, info);
1893: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1894: }
1895: return 0;
1896: }
1898: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1899: {
1900: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1902: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
1903: MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info);
1904: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1905: return 0;
1906: }
1908: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
1909: {
1910: *type = MATSOLVERCUSPARSE;
1911: return 0;
1912: }
1914: /*MC
1915: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
1916: of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
1917: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1918: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
1919: cuSPARSE triangular solve algorithm; however, the performance can be quite poor and thus these
1920: algorithms are not recommended. This class does NOT support direct solver operations.
1922: Level: beginner
1924: .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1925: M*/
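/* Typical usage (a sketch based on standard PETSc options-database usage; the option names below are
   general PETSc/PC options and are not defined in this file): select this solver with, e.g.,
     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
   and optionally choose where the factorization is computed with
     -mat_factor_bind_factorization <host|device>
   which is read in MatGetFactor_seqaijcusparse_cusparse() below. */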
1927: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1928: {
1929: PetscInt n = A->rmap->n;
1930: PetscBool factOnDevice, factOnHost;
1931: char *prefix;
1932: char factPlace[32] = "device"; /* the default */
1934: MatCreate(PetscObjectComm((PetscObject)A), B);
1935: MatSetSizes(*B, n, n, n, n);
1936: (*B)->factortype = ftype;
1937: MatSetType(*B, MATSEQAIJCUSPARSE);
1939: prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1940: PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1941: PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL);
1942: PetscOptionsEnd();
1943: PetscStrcasecmp("device", factPlace, &factOnDevice);
1944: PetscStrcasecmp("host", factPlace, &factOnHost);
1946: ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
1948: if (A->boundtocpu && A->bindingpropagates) MatBindToCPU(*B, PETSC_TRUE);
1949: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1950: MatSetBlockSizesFromMats(*B, A, A);
1951: if (!A->boundtocpu) {
1952: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1953: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1954: } else {
1955: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1956: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ;
1957: }
1958: PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]);
1959: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]);
1960: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
1961: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1962: if (!A->boundtocpu) {
1963: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1964: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1965: } else {
1966: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ;
1967: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1968: }
1969: PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
1970: PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]);
1971: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
1973: MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL);
1974: (*B)->canuseordering = PETSC_TRUE;
1975: PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse);
1976: return 0;
1977: }
1979: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1980: {
1981: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1982: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1983: #if CUSPARSE_VERSION >= 11500
1984: Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1985: #endif
1987: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1988: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0);
1989: if (A->factortype == MAT_FACTOR_NONE) {
1990: CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
1991: cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1992: }
1993: #if CUSPARSE_VERSION >= 11500
1994: else if (fs->csrVal) {
1995: /* We have a factorized matrix on device and are able to copy it to host */
1996: cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1997: }
1998: #endif
1999: else
2000: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
2001: PetscLogGpuToCpu(a->nz * sizeof(PetscScalar));
2002: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0);
2003: A->offloadmask = PETSC_OFFLOAD_BOTH;
2004: }
2005: return 0;
2006: }
2008: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2009: {
2010: MatSeqAIJCUSPARSECopyFromGPU(A);
2011: *array = ((Mat_SeqAIJ *)A->data)->a;
2012: return 0;
2013: }
2015: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2016: {
2017: A->offloadmask = PETSC_OFFLOAD_CPU;
2018: *array = NULL;
2019: return 0;
2020: }
2022: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2023: {
2024: MatSeqAIJCUSPARSECopyFromGPU(A);
2025: *array = ((Mat_SeqAIJ *)A->data)->a;
2026: return 0;
2027: }
2029: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2030: {
2031: *array = NULL;
2032: return 0;
2033: }
2035: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2036: {
2037: *array = ((Mat_SeqAIJ *)A->data)->a;
2038: return 0;
2039: }
2041: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2042: {
2043: A->offloadmask = PETSC_OFFLOAD_CPU;
2044: *array = NULL;
2045: return 0;
2046: }
2048: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2049: {
2050: Mat_SeqAIJCUSPARSE *cusp;
2051: CsrMatrix *matrix;
2053: MatSeqAIJCUSPARSECopyToGPU(A);
2055: cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
2057: matrix = (CsrMatrix *)cusp->mat->mat;
2059: if (i) {
2060: #if !defined(PETSC_USE_64BIT_INDICES)
2061: *i = matrix->row_offsets->data().get();
2062: #else
2063: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2064: #endif
2065: }
2066: if (j) {
2067: #if !defined(PETSC_USE_64BIT_INDICES)
2068: *j = matrix->column_indices->data().get();
2069: #else
2070: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
2071: #endif
2072: }
2073: if (a) *a = matrix->values->data().get();
2074: if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2075: return 0;
2076: }
2078: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2079: {
2080: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
2081: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2082: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2083: PetscInt m = A->rmap->n, *ii, *ridx, tmp;
2084: cusparseStatus_t stat;
2085: PetscBool both = PETSC_TRUE;
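/* Summary of the copy strategy below: if the nonzero pattern is unchanged and the format is CSR,
   only the numerical values are re-uploaded; otherwise the whole GPU representation (CSR arrays,
   descriptors, work vectors) is destroyed and rebuilt from the host data. */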
2088: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2089: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2090: CsrMatrix *matrix;
2091: matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2094: PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2095: matrix->values->assign(a->a, a->a + a->nz);
2096: WaitForCUDA();
2097: PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar));
2098: PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2099: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
2100: } else {
2101: PetscInt nnz;
2102: PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2103: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format);
2104: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
2105: delete cusparsestruct->workVector;
2106: delete cusparsestruct->rowoffsets_gpu;
2107: cusparsestruct->workVector = NULL;
2108: cusparsestruct->rowoffsets_gpu = NULL;
2109: try {
2110: if (a->compressedrow.use) {
2111: m = a->compressedrow.nrows;
2112: ii = a->compressedrow.i;
2113: ridx = a->compressedrow.rindex;
2114: } else {
2115: m = A->rmap->n;
2116: ii = a->i;
2117: ridx = NULL;
2118: }
2120: if (!a->a) {
2121: nnz = ii[m];
2122: both = PETSC_FALSE;
2123: } else nnz = a->nz;
2126: /* create cusparse matrix */
2127: cusparsestruct->nrows = m;
2128: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2129: cusparseCreateMatDescr(&matstruct->descr);
2130: cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);
2131: cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
2133: cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));
2134: cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar));
2135: cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));
2136: cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2137: cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2138: cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2139: cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);
2141: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2142: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2143: /* set the matrix */
2144: CsrMatrix *mat = new CsrMatrix;
2145: mat->num_rows = m;
2146: mat->num_cols = A->cmap->n;
2147: mat->num_entries = nnz;
2148: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2149: mat->row_offsets->assign(ii, ii + m + 1);
2151: mat->column_indices = new THRUSTINTARRAY32(nnz);
2152: mat->column_indices->assign(a->j, a->j + nnz);
2154: mat->values = new THRUSTARRAY(nnz);
2155: if (a->a) mat->values->assign(a->a, a->a + nnz);
2157: /* assign the pointer */
2158: matstruct->mat = mat;
2159: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2160: if (mat->num_rows) { /* cusparse errors on empty matrices! */
2161: stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2162: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2163: PetscCallCUSPARSE(stat);
2164: }
2165: #endif
2166: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2167: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2168: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2169: #else
2170: CsrMatrix *mat = new CsrMatrix;
2171: mat->num_rows = m;
2172: mat->num_cols = A->cmap->n;
2173: mat->num_entries = nnz;
2174: mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2175: mat->row_offsets->assign(ii, ii + m + 1);
2177: mat->column_indices = new THRUSTINTARRAY32(nnz);
2178: mat->column_indices->assign(a->j, a->j + nnz);
2180: mat->values = new THRUSTARRAY(nnz);
2181: if (a->a) mat->values->assign(a->a, a->a + nnz);
2183: cusparseHybMat_t hybMat;
2184: cusparseCreateHybMat(&hybMat);
2185: cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2186: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
2187: PetscCallCUSPARSE(stat);
2188: /* assign the pointer */
2189: matstruct->mat = hybMat;
2191: if (mat) {
2192: if (mat->values) delete (THRUSTARRAY *)mat->values;
2193: if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2194: if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2195: delete (CsrMatrix *)mat;
2196: }
2197: #endif
2198: }
2200: /* assign the compressed row indices */
2201: if (a->compressedrow.use) {
2202: cusparsestruct->workVector = new THRUSTARRAY(m);
2203: matstruct->cprowIndices = new THRUSTINTARRAY(m);
2204: matstruct->cprowIndices->assign(ridx, ridx + m);
2205: tmp = m;
2206: } else {
2207: cusparsestruct->workVector = NULL;
2208: matstruct->cprowIndices = NULL;
2209: tmp = 0;
2210: }
2211: PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar));
2213: /* assign the pointer */
2214: cusparsestruct->mat = matstruct;
2215: } catch (char *ex) {
2216: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2217: }
2218: WaitForCUDA();
2219: PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0);
2220: cusparsestruct->nonzerostate = A->nonzerostate;
2221: }
2222: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2223: }
2224: return 0;
2225: }
2227: struct VecCUDAPlusEquals {
2228: template <typename Tuple>
2229: __host__ __device__ void operator()(Tuple t)
2230: {
2231: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2232: }
2233: };
2235: struct VecCUDAEquals {
2236: template <typename Tuple>
2237: __host__ __device__ void operator()(Tuple t)
2238: {
2239: thrust::get<1>(t) = thrust::get<0>(t);
2240: }
2241: };
2243: struct VecCUDAEqualsReverse {
2244: template <typename Tuple>
2245: __host__ __device__ void operator()(Tuple t)
2246: {
2247: thrust::get<0>(t) = thrust::get<1>(t);
2248: }
2249: };
2251: struct MatMatCusparse {
2252: PetscBool cisdense;
2253: PetscScalar *Bt;
2254: Mat X;
2255: PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2256: PetscLogDouble flops;
2257: CsrMatrix *Bcsr;
2259: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2260: cusparseSpMatDescr_t matSpBDescr;
2261: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
2262: cusparseDnMatDescr_t matBDescr;
2263: cusparseDnMatDescr_t matCDescr;
2264: PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
2265: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2266: void *dBuffer4;
2267: void *dBuffer5;
2268: #endif
2269: size_t mmBufferSize;
2270: void *mmBuffer;
2271: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2272: cusparseSpGEMMDescr_t spgemmDesc;
2273: #endif
2274: };
2276: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2277: {
2278: MatMatCusparse *mmdata = (MatMatCusparse *)data;
2280: cudaFree(mmdata->Bt);
2281: delete mmdata->Bcsr;
2282: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2283: if (mmdata->matSpBDescr) cusparseDestroySpMat(mmdata->matSpBDescr);
2284: if (mmdata->matBDescr) cusparseDestroyDnMat(mmdata->matBDescr);
2285: if (mmdata->matCDescr) cusparseDestroyDnMat(mmdata->matCDescr);
2286: if (mmdata->spgemmDesc) cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);
2287: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2288: if (mmdata->dBuffer4) cudaFree(mmdata->dBuffer4);
2289: if (mmdata->dBuffer5) cudaFree(mmdata->dBuffer5);
2290: #endif
2291: if (mmdata->mmBuffer) cudaFree(mmdata->mmBuffer);
2292: if (mmdata->mmBuffer2) cudaFree(mmdata->mmBuffer2);
2293: #endif
2294: MatDestroy(&mmdata->X);
2295: PetscFree(data);
2296: return 0;
2297: }
2299: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2301: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2302: {
2303: Mat_Product *product = C->product;
2304: Mat A, B;
2305: PetscInt m, n, blda, clda;
2306: PetscBool flg, biscuda;
2307: Mat_SeqAIJCUSPARSE *cusp;
2308: cusparseStatus_t stat;
2309: cusparseOperation_t opA;
2310: const PetscScalar *barray;
2311: PetscScalar *carray;
2312: MatMatCusparse *mmdata;
2313: Mat_SeqAIJCUSPARSEMultStruct *mat;
2314: CsrMatrix *csrmat;
2316: MatCheckProduct(C, 1);
2318: mmdata = (MatMatCusparse *)product->data;
2319: A = product->A;
2320: B = product->B;
2321: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2323: /* currently CopyToGpu does not copy if the matrix is bound to CPU.
2324: Instead of silently accepting the wrong answer, I prefer to raise the error */
2326: MatSeqAIJCUSPARSECopyToGPU(A);
2327: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2328: switch (product->type) {
2329: case MATPRODUCT_AB:
2330: case MATPRODUCT_PtAP:
2331: mat = cusp->mat;
2332: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2333: m = A->rmap->n;
2334: n = B->cmap->n;
2335: break;
2336: case MATPRODUCT_AtB:
2337: if (!A->form_explicit_transpose) {
2338: mat = cusp->mat;
2339: opA = CUSPARSE_OPERATION_TRANSPOSE;
2340: } else {
2341: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2342: mat = cusp->matTranspose;
2343: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2344: }
2345: m = A->cmap->n;
2346: n = B->cmap->n;
2347: break;
2348: case MATPRODUCT_ABt:
2349: case MATPRODUCT_RARt:
2350: mat = cusp->mat;
2351: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2352: m = A->rmap->n;
2353: n = B->rmap->n;
2354: break;
2355: default:
2356: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2357: }
2359: csrmat = (CsrMatrix *)mat->mat;
2360: /* if the user passed a CPU matrix, copy the data to the GPU */
2361: PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda);
2362: if (!biscuda) MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
2363: MatDenseCUDAGetArrayRead(B, &barray);
2365: MatDenseGetLDA(B, &blda);
2366: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2367: MatDenseCUDAGetArrayWrite(mmdata->X, &carray);
2368: MatDenseGetLDA(mmdata->X, &clda);
2369: } else {
2370: MatDenseCUDAGetArrayWrite(C, &carray);
2371: MatDenseGetLDA(C, &clda);
2372: }
2374: PetscLogGpuTimeBegin();
2375: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2376: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2377: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2378: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2379: size_t mmBufferSize;
2380: if (mmdata->initialized && mmdata->Blda != blda) {
2381: cusparseDestroyDnMat(mmdata->matBDescr);
2382: mmdata->matBDescr = NULL;
2383: }
2384: if (!mmdata->matBDescr) {
2385: cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL);
2386: mmdata->Blda = blda;
2387: }
2389: if (mmdata->initialized && mmdata->Clda != clda) {
2390: cusparseDestroyDnMat(mmdata->matCDescr);
2391: mmdata->matCDescr = NULL;
2392: }
2393: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2394: cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL);
2395: mmdata->Clda = clda;
2396: }
2398: if (!mat->matDescr) {
2399: stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2400: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2401: PetscCallCUSPARSE(stat);
2402: }
2403: stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
2404: PetscCallCUSPARSE(stat);
2405: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2406: cudaFree(mmdata->mmBuffer);
2407: cudaMalloc(&mmdata->mmBuffer, mmBufferSize);
2408: mmdata->mmBufferSize = mmBufferSize;
2409: }
2410: mmdata->initialized = PETSC_TRUE;
2411: } else {
2412: /* to be safe, always update pointers of the mats */
2413: cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get());
2414: cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray);
2415: cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray);
2416: }
2418: /* do cusparseSpMM, which supports transpose on B */
2419: stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
2420: PetscCallCUSPARSE(stat);
2421: #else
2422: PetscInt k;
2423: /* cusparseXcsrmm does not support transpose on B */
2424: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2425: cublasHandle_t cublasv2handle;
2426: cublasStatus_t cerr;
2428: PetscCUBLASGetHandle(&cublasv2handle);
2429: cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
2430: PetscCallCUBLAS(cerr);
2431: blda = B->cmap->n;
2432: k = B->cmap->n;
2433: } else {
2434: k = B->rmap->n;
2435: }
2437: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2438: stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
2439: PetscCallCUSPARSE(stat);
2440: #endif
2441: PetscLogGpuTimeEnd();
2442: PetscLogGpuFlops(n * 2.0 * csrmat->num_entries);
2443: MatDenseCUDARestoreArrayRead(B, &barray);
2444: if (product->type == MATPRODUCT_RARt) {
2445: MatDenseCUDARestoreArrayWrite(mmdata->X, &carray);
2446: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE);
2447: } else if (product->type == MATPRODUCT_PtAP) {
2448: MatDenseCUDARestoreArrayWrite(mmdata->X, &carray);
2449: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE);
2450: } else {
2451: MatDenseCUDARestoreArrayWrite(C, &carray);
2452: }
2453: if (mmdata->cisdense) MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C);
2454: if (!biscuda) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
2455: return 0;
2456: }
2458: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2459: {
2460: Mat_Product *product = C->product;
2461: Mat A, B;
2462: PetscInt m, n;
2463: PetscBool cisdense, flg;
2464: MatMatCusparse *mmdata;
2465: Mat_SeqAIJCUSPARSE *cusp;
2467: MatCheckProduct(C, 1);
2469: A = product->A;
2470: B = product->B;
2471: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2473: cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2475: switch (product->type) {
2476: case MATPRODUCT_AB:
2477: m = A->rmap->n;
2478: n = B->cmap->n;
2479: break;
2480: case MATPRODUCT_AtB:
2481: m = A->cmap->n;
2482: n = B->cmap->n;
2483: break;
2484: case MATPRODUCT_ABt:
2485: m = A->rmap->n;
2486: n = B->rmap->n;
2487: break;
2488: case MATPRODUCT_PtAP:
2489: m = B->cmap->n;
2490: n = B->cmap->n;
2491: break;
2492: case MATPRODUCT_RARt:
2493: m = B->rmap->n;
2494: n = B->rmap->n;
2495: break;
2496: default:
2497: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2498: }
2499: MatSetSizes(C, m, n, m, n);
2500: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2501: PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense);
2502: MatSetType(C, MATSEQDENSECUDA);
2504: /* product data */
2505: PetscNew(&mmdata);
2506: mmdata->cisdense = cisdense;
2507: #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2508: /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
2509: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar));
2510: #endif
2511: /* for these products we need intermediate storage */
2512: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2513: MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X);
2514: MatSetType(mmdata->X, MATSEQDENSECUDA);
2515: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2516: MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n);
2517: } else {
2518: MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n);
2519: }
2520: }
2521: C->product->data = mmdata;
2522: C->product->destroy = MatDestroy_MatMatCusparse;
2524: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2525: return 0;
2526: }
2528: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2529: {
2530: Mat_Product *product = C->product;
2531: Mat A, B;
2532: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2533: Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
2534: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2535: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2536: PetscBool flg;
2537: cusparseStatus_t stat;
2538: MatProductType ptype;
2539: MatMatCusparse *mmdata;
2540: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2541: cusparseSpMatDescr_t BmatSpDescr;
2542: #endif
2543: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2545: MatCheckProduct(C, 1);
2547: PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg);
2549: mmdata = (MatMatCusparse *)C->product->data;
2550: A = product->A;
2551: B = product->B;
2552: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2553: mmdata->reusesym = PETSC_FALSE;
2554: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2556: Cmat = Ccusp->mat;
2558: Ccsr = (CsrMatrix *)Cmat->mat;
2560: goto finalize;
2561: }
2562: if (!c->nz) goto finalize;
2563: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2565: PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg);
2569: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2570: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2571: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2575: MatSeqAIJCUSPARSECopyToGPU(A);
2576: MatSeqAIJCUSPARSECopyToGPU(B);
2578: ptype = product->type;
2579: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2580: ptype = MATPRODUCT_AB;
2582: }
2583: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2584: ptype = MATPRODUCT_AB;
2586: }
2587: switch (ptype) {
2588: case MATPRODUCT_AB:
2589: Amat = Acusp->mat;
2590: Bmat = Bcusp->mat;
2591: break;
2592: case MATPRODUCT_AtB:
2593: Amat = Acusp->matTranspose;
2594: Bmat = Bcusp->mat;
2595: break;
2596: case MATPRODUCT_ABt:
2597: Amat = Acusp->mat;
2598: Bmat = Bcusp->matTranspose;
2599: break;
2600: default:
2601: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2602: }
2603: Cmat = Ccusp->mat;
2607: Acsr = (CsrMatrix *)Amat->mat;
2608: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2609: Ccsr = (CsrMatrix *)Cmat->mat;
2613: PetscLogGpuTimeBegin();
2614: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2615: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2616: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2617: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2618: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2619: PetscCallCUSPARSE(stat);
2620: #else
2621: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2622: PetscCallCUSPARSE(stat);
2623: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2624: PetscCallCUSPARSE(stat);
2625: #endif
2626: #else
2627: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2628: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2629: PetscCallCUSPARSE(stat);
2630: #endif
2631: PetscLogGpuFlops(mmdata->flops);
2632: WaitForCUDA();
2633: PetscLogGpuTimeEnd();
2634: C->offloadmask = PETSC_OFFLOAD_GPU;
2635: finalize:
2636: /* shorter version of MatAssemblyEnd_SeqAIJ */
2637: PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz);
2638: PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n");
2639: PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax);
2640: c->reallocs = 0;
2641: C->info.mallocs += 0;
2642: C->info.nz_unneeded = 0;
2643: C->assembled = C->was_assembled = PETSC_TRUE;
2644: C->num_ass++;
2645: return 0;
2646: }
2648: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2649: {
2650: Mat_Product *product = C->product;
2651: Mat A, B;
2652: Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
2653: Mat_SeqAIJ *a, *b, *c;
2654: Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2655: CsrMatrix *Acsr, *Bcsr, *Ccsr;
2656: PetscInt i, j, m, n, k;
2657: PetscBool flg;
2658: cusparseStatus_t stat;
2659: MatProductType ptype;
2660: MatMatCusparse *mmdata;
2661: PetscLogDouble flops;
2662: PetscBool biscompressed, ciscompressed;
2663: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2664: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2665: cusparseSpMatDescr_t BmatSpDescr;
2666: #else
2667: int cnz;
2668: #endif
2669: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2671: MatCheckProduct(C, 1);
2673: A = product->A;
2674: B = product->B;
2675: PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg);
2677: PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg);
2679: a = (Mat_SeqAIJ *)A->data;
2680: b = (Mat_SeqAIJ *)B->data;
2681: /* product data */
2682: PetscNew(&mmdata);
2683: C->product->data = mmdata;
2684: C->product->destroy = MatDestroy_MatMatCusparse;
2686: MatSeqAIJCUSPARSECopyToGPU(A);
2687: MatSeqAIJCUSPARSECopyToGPU(B);
2688: Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2689: Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2693: ptype = product->type;
2694: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2695: ptype = MATPRODUCT_AB;
2696: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2697: }
2698: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2699: ptype = MATPRODUCT_AB;
2700: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2701: }
2702: biscompressed = PETSC_FALSE;
2703: ciscompressed = PETSC_FALSE;
2704: switch (ptype) {
2705: case MATPRODUCT_AB:
2706: m = A->rmap->n;
2707: n = B->cmap->n;
2708: k = A->cmap->n;
2709: Amat = Acusp->mat;
2710: Bmat = Bcusp->mat;
2711: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2712: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2713: break;
2714: case MATPRODUCT_AtB:
2715: m = A->cmap->n;
2716: n = B->cmap->n;
2717: k = A->rmap->n;
2718: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
2719: Amat = Acusp->matTranspose;
2720: Bmat = Bcusp->mat;
2721: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2722: break;
2723: case MATPRODUCT_ABt:
2724: m = A->rmap->n;
2725: n = B->rmap->n;
2726: k = A->cmap->n;
2727: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
2728: Amat = Acusp->mat;
2729: Bmat = Bcusp->matTranspose;
2730: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2731: break;
2732: default:
2733: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2734: }
2736: /* create cusparse matrix */
2737: MatSetSizes(C, m, n, m, n);
2738: MatSetType(C, MATSEQAIJCUSPARSE);
2739: c = (Mat_SeqAIJ *)C->data;
2740: Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2741: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2742: Ccsr = new CsrMatrix;
2744: c->compressedrow.use = ciscompressed;
2745: if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2746: c->compressedrow.nrows = a->compressedrow.nrows;
2747: PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex);
2748: PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows);
2749: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2750: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2751: Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2752: } else {
2753: c->compressedrow.nrows = 0;
2754: c->compressedrow.i = NULL;
2755: c->compressedrow.rindex = NULL;
2756: Ccusp->workVector = NULL;
2757: Cmat->cprowIndices = NULL;
2758: }
2759: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2760: Ccusp->mat = Cmat;
2761: Ccusp->mat->mat = Ccsr;
2762: Ccsr->num_rows = Ccusp->nrows;
2763: Ccsr->num_cols = n;
2764: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
2765: cusparseCreateMatDescr(&Cmat->descr);
2766: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
2767: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
2768: cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
2769: cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
2770: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
2771: cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2772: cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2773: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
2774: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in different calls when matrices have zero rows/columns! */
2775: thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2776: c->nz = 0;
2777: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2778: Ccsr->values = new THRUSTARRAY(c->nz);
2779: goto finalizesym;
2780: }
2784: Acsr = (CsrMatrix *)Amat->mat;
2785: if (!biscompressed) {
2786: Bcsr = (CsrMatrix *)Bmat->mat;
2787: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2788: BmatSpDescr = Bmat->matDescr;
2789: #endif
2790: } else { /* we need to use row offsets for the full matrix */
2791: CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat;
2792: Bcsr = new CsrMatrix;
2793: Bcsr->num_rows = B->rmap->n;
2794: Bcsr->num_cols = cBcsr->num_cols;
2795: Bcsr->num_entries = cBcsr->num_entries;
2796: Bcsr->column_indices = cBcsr->column_indices;
2797: Bcsr->values = cBcsr->values;
2798: if (!Bcusp->rowoffsets_gpu) {
2799: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2800: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
2801: PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
2802: }
2803: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2804: mmdata->Bcsr = Bcsr;
2805: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2806: if (Bcsr->num_rows && Bcsr->num_cols) {
2807: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2808: PetscCallCUSPARSE(stat);
2809: }
2810: BmatSpDescr = mmdata->matSpBDescr;
2811: #endif
2812: }
2815: /* precompute flops count */
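/* A rough operation count, matching the loops below: for C = A*B every nonzero A(i,j) multiplies row j of B
   (2*nnz(B(j,:)) flops); for C = A^T*B the entries of row i of A pair with the entries of row i of B */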
2816: if (ptype == MATPRODUCT_AB) {
2817: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2818: const PetscInt st = a->i[i];
2819: const PetscInt en = a->i[i + 1];
2820: for (j = st; j < en; j++) {
2821: const PetscInt brow = a->j[j];
2822: flops += 2. * (b->i[brow + 1] - b->i[brow]);
2823: }
2824: }
2825: } else if (ptype == MATPRODUCT_AtB) {
2826: for (i = 0, flops = 0; i < A->rmap->n; i++) {
2827: const PetscInt anzi = a->i[i + 1] - a->i[i];
2828: const PetscInt bnzi = b->i[i + 1] - b->i[i];
2829: flops += (2. * anzi) * bnzi;
2830: }
2831: } else { /* TODO */
2832: flops = 0.;
2833: }
2835: mmdata->flops = flops;
2836: PetscLogGpuTimeBegin();
2838: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2839: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2840: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
2841: PetscCallCUSPARSE(stat);
2842: cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);
2843: #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2844: {
2845: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2846: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2847: */
2848: void *dBuffer1 = NULL;
2849: void *dBuffer2 = NULL;
2850: void *dBuffer3 = NULL;
2851: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2852: size_t bufferSize1 = 0;
2853: size_t bufferSize2 = 0;
2854: size_t bufferSize3 = 0;
2855: size_t bufferSize4 = 0;
2856: size_t bufferSize5 = 0;
2858: /*----------------------------------------------------------------------*/
2859: /* ask for bufferSize1 bytes of external memory */
2860: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
2861: PetscCallCUSPARSE(stat);
2862: cudaMalloc((void **)&dBuffer1, bufferSize1);
2863: /* inspect the matrices A and B to understand the memory requirement for the next step */
2864: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
2865: PetscCallCUSPARSE(stat);
2867: /*----------------------------------------------------------------------*/
2868: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
2869: PetscCallCUSPARSE(stat);
2870: cudaMalloc((void **)&dBuffer2, bufferSize2);
2871: cudaMalloc((void **)&dBuffer3, bufferSize3);
2872: cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4);
2873: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
2874: PetscCallCUSPARSE(stat);
2875: cudaFree(dBuffer1);
2876: cudaFree(dBuffer2);
2878: /*----------------------------------------------------------------------*/
2879: /* get matrix C non-zero entries C_nnz1 */
2880: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2881: c->nz = (PetscInt)C_nnz1;
2882: /* allocate matrix C */
2883: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2884: cudaPeekAtLastError(); /* catch out of memory errors */
2885: Ccsr->values = new THRUSTARRAY(c->nz);
2886: cudaPeekAtLastError(); /* catch out of memory errors */
2887: /* update matC with the new pointers */
2888: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2889: PetscCallCUSPARSE(stat);
2891: /*----------------------------------------------------------------------*/
2892: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
2893: PetscCallCUSPARSE(stat);
2894: cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5);
2895: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
2896: PetscCallCUSPARSE(stat);
2897: cudaFree(dBuffer3);
2898: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2899: PetscCallCUSPARSE(stat);
2900: PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024);
2901: }
2902: #else
2903: size_t bufSize2;
2904: /* ask for bufferSize bytes of external memory */
2905: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
2906: PetscCallCUSPARSE(stat);
2907: cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2);
2908: /* inspect the matrices A and B to understand the memory requirement for the next step */
2909: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
2910: PetscCallCUSPARSE(stat);
2911: /* ask again for the number of bytes of external memory needed, this time for the compute step */
2912: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
2913: PetscCallCUSPARSE(stat);
2914: /* The cuSPARSE documentation is not clear here, nor is the API.
2915: We need both buffers to perform the operations properly!
2916: mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2917: it only appears in the workEstimation calls, yet it seems to be needed in compute, so presumably its address
2918: is stored in the descriptor. What a messy API... */
2919: cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize);
2920: /* compute the intermediate product of A * B */
2921: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
2922: PetscCallCUSPARSE(stat);
2923: /* get matrix C non-zero entries C_nnz1 */
2924: cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);
2925: c->nz = (PetscInt)C_nnz1;
2926: PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
2927: mmdata->mmBufferSize / 1024));
2928: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2929: cudaPeekAtLastError(); /* catch out of memory errors */
2930: Ccsr->values = new THRUSTARRAY(c->nz);
2931: cudaPeekAtLastError(); /* catch out of memory errors */
2932: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
2933: PetscCallCUSPARSE(stat);
2934: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
2935: PetscCallCUSPARSE(stat);
2936: #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2937: #else
2938: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);
2939: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2940: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
2941: PetscCallCUSPARSE(stat);
2942: c->nz = cnz;
2943: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2944: cudaPeekAtLastError(); /* catch out of memory errors */
2945: Ccsr->values = new THRUSTARRAY(c->nz);
2946: cudaPeekAtLastError(); /* catch out of memory errors */
2948: cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);
2949: /* with the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
2950: I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows doing the symbolic phase by passing NULL for values, but it seems quite buggy when
2951: D is NULL, despite the fact that the cuSPARSE documentation claims it is supported! */
2952: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
2953: Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
2954: PetscCallCUSPARSE(stat);
2955: #endif
2956: PetscLogGpuFlops(mmdata->flops);
2957: PetscLogGpuTimeEnd();
2958: finalizesym:
2959: c->singlemalloc = PETSC_FALSE;
2960: c->free_a = PETSC_TRUE;
2961: c->free_ij = PETSC_TRUE;
2962: PetscMalloc1(m + 1, &c->i);
2963: PetscMalloc1(c->nz, &c->j);
2964: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2965: PetscInt *d_i = c->i;
2966: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2967: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2968: ii = *Ccsr->row_offsets;
2969: jj = *Ccsr->column_indices;
2970: if (ciscompressed) d_i = c->compressedrow.i;
2971: cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2972: cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2973: } else {
2974: PetscInt *d_i = c->i;
2975: if (ciscompressed) d_i = c->compressedrow.i;
2976: cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2977: cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
2978: }
2979: if (ciscompressed) { /* need to expand host row offsets */
2980: PetscInt r = 0;
2981: c->i[0] = 0;
2982: for (k = 0; k < c->compressedrow.nrows; k++) {
2983: const PetscInt next = c->compressedrow.rindex[k];
2984: const PetscInt old = c->compressedrow.i[k];
2985: for (; r < next; r++) c->i[r + 1] = old;
2986: }
2987: for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
2988: }
2989: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
2990: PetscMalloc1(m, &c->ilen);
2991: PetscMalloc1(m, &c->imax);
2992: c->maxnz = c->nz;
2993: c->nonzerorowcnt = 0;
2994: c->rmax = 0;
2995: for (k = 0; k < m; k++) {
2996: const PetscInt nn = c->i[k + 1] - c->i[k];
2997: c->ilen[k] = c->imax[k] = nn;
2998: c->nonzerorowcnt += (PetscInt) !!nn;
2999: c->rmax = PetscMax(c->rmax, nn);
3000: }
3001: MatMarkDiagonal_SeqAIJ(C);
3002: PetscMalloc1(c->nz, &c->a);
3003: Ccsr->num_entries = c->nz;
3005: C->nonzerostate++;
3006: PetscLayoutSetUp(C->rmap);
3007: PetscLayoutSetUp(C->cmap);
3008: Ccusp->nonzerostate = C->nonzerostate;
3009: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3010: C->preallocated = PETSC_TRUE;
3011: C->assembled = PETSC_FALSE;
3012: C->was_assembled = PETSC_FALSE;
3013: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3014: mmdata->reusesym = PETSC_TRUE;
3015: C->offloadmask = PETSC_OFFLOAD_GPU;
3016: }
3017: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3018: return 0;
3019: }
3021: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3023: /* handles sparse or dense B */
3024: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3025: {
3026: Mat_Product *product = mat->product;
3027: PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3029: MatCheckProduct(mat, 1);
3030: PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense);
3031: if (!product->A->boundtocpu && !product->B->boundtocpu) PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp);
3032: if (product->type == MATPRODUCT_ABC) {
3033: Ciscusp = PETSC_FALSE;
3034: if (!product->C->boundtocpu) PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp);
3035: }
3036: if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3037: PetscBool usecpu = PETSC_FALSE;
3038: switch (product->type) {
3039: case MATPRODUCT_AB:
3040: if (product->api_user) {
3041: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
3042: PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
3043: PetscOptionsEnd();
3044: } else {
3045: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
3046: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL);
3047: PetscOptionsEnd();
3048: }
3049: break;
3050: case MATPRODUCT_AtB:
3051: if (product->api_user) {
3052: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
3053: PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
3054: PetscOptionsEnd();
3055: } else {
3056: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
3057: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL);
3058: PetscOptionsEnd();
3059: }
3060: break;
3061: case MATPRODUCT_PtAP:
3062: if (product->api_user) {
3063: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
3064: PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
3065: PetscOptionsEnd();
3066: } else {
3067: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
3068: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL);
3069: PetscOptionsEnd();
3070: }
3071: break;
3072: case MATPRODUCT_RARt:
3073: if (product->api_user) {
3074: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
3075: PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
3076: PetscOptionsEnd();
3077: } else {
3078: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
3079: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL);
3080: PetscOptionsEnd();
3081: }
3082: break;
3083: case MATPRODUCT_ABC:
3084: if (product->api_user) {
3085: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
3086: PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
3087: PetscOptionsEnd();
3088: } else {
3089: PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
3090: PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL);
3091: PetscOptionsEnd();
3092: }
3093: break;
3094: default:
3095: break;
3096: }
3097: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3098: }
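/* Illustrative example: with the options registered above, a user of the MatMatMult() interface can force the CPU
   fallback by running with -matmatmult_backend_cpu, while a user of the MatProduct API would pass
   -mat_product_algorithm_backend_cpu instead */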
3099: /* dispatch */
3100: if (isdense) {
3101: switch (product->type) {
3102: case MATPRODUCT_AB:
3103: case MATPRODUCT_AtB:
3104: case MATPRODUCT_ABt:
3105: case MATPRODUCT_PtAP:
3106: case MATPRODUCT_RARt:
3107: if (product->A->boundtocpu) {
3108: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
3109: } else {
3110: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3111: }
3112: break;
3113: case MATPRODUCT_ABC:
3114: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3115: break;
3116: default:
3117: break;
3118: }
3119: } else if (Biscusp && Ciscusp) {
3120: switch (product->type) {
3121: case MATPRODUCT_AB:
3122: case MATPRODUCT_AtB:
3123: case MATPRODUCT_ABt:
3124: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3125: break;
3126: case MATPRODUCT_PtAP:
3127: case MATPRODUCT_RARt:
3128: case MATPRODUCT_ABC:
3129: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3130: break;
3131: default:
3132: break;
3133: }
3134: } else { /* fallback for AIJ */
3135: MatProductSetFromOptions_SeqAIJ(mat);
3136: }
3137: return 0;
3138: }
3140: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3141: {
3142: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE);
3143: return 0;
3144: }
3146: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3147: {
3148: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE);
3149: return 0;
3150: }
3152: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3153: {
3154: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE);
3155: return 0;
3156: }
3158: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3159: {
3160: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE);
3161: return 0;
3162: }
3164: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3165: {
3166: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE);
3167: return 0;
3168: }
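/* Helper kernel used below when A is stored in compressed-row form: thread i adds entry x[i] of the short work
   vector into the full-length output vector y at row idx[i] */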
3170: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3171: {
3172: int i = blockIdx.x * blockDim.x + threadIdx.x;
3173: if (i < n) y[idx[i]] += x[i];
3174: }
3176: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3177: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3178: {
3179: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3180: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
3181: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3182: PetscScalar *xarray, *zarray, *dptr, *beta, *xptr;
3183: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3184: PetscBool compressed;
3185: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3186: PetscInt nx, ny;
3187: #endif
3190: if (!a->nz) {
3191: if (!yy) VecSet_SeqCUDA(zz, 0);
3192: else VecCopy_SeqCUDA(yy, zz);
3193: return 0;
3194: }
3195: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3196: MatSeqAIJCUSPARSECopyToGPU(A);
3197: if (!trans) {
3198: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3200: } else {
3201: if (herm || !A->form_explicit_transpose) {
3202: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3203: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3204: } else {
3205: if (!cusparsestruct->matTranspose) MatSeqAIJCUSPARSEFormExplicitTranspose(A);
3206: matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3207: }
3208: }
3209: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3210: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3212: try {
3213: VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray);
3214: if (yy == zz) VecCUDAGetArray(zz, &zarray); /* read & write zz, so need to get uptodate zarray on GPU */
3215: else VecCUDAGetArrayWrite(zz, &zarray); /* write zz, so no need to init zarray on GPU */
3217: PetscLogGpuTimeBegin();
3218: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3219: /* z = A x + beta y.
3220: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3221: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3222: */
3223: xptr = xarray;
3224: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3225: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3226: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3227: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3228: allocated to accommodate different uses. So we get the length info directly from mat.
3229: */
3230: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3231: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3232: nx = mat->num_cols;
3233: ny = mat->num_rows;
3234: }
3235: #endif
3236: } else {
3237: /* z = A^T x + beta y
3238: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3239: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3240: */
3241: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3242: dptr = zarray;
3243: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3244: if (compressed) { /* Scatter x to work vector */
3245: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3247: thrust::for_each(
3248: #if PetscDefined(HAVE_THRUST_ASYNC)
3249: thrust::cuda::par.on(PetscDefaultCudaStream),
3250: #endif
3251: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3252: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3253: }
3254: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3255: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3256: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3257: nx = mat->num_rows;
3258: ny = mat->num_cols;
3259: }
3260: #endif
3261: }
3263: /* csr_spmv does y = alpha op(A) x + beta y */
3264: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3265: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3267: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3268: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype);
3269: cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype);
3270: PetscCallCUSPARSE(
3271: cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
3272: cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize);
3274: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3275: } else {
3276: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3277: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr);
3278: cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr);
3279: }
3281: PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3282: matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3283: #else
3284: CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3285: cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr);
3286: #endif
3287: } else {
3288: if (cusparsestruct->nrows) {
3289: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3290: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3291: #else
3292: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3293: cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr);
3294: #endif
3295: }
3296: }
3297: PetscLogGpuTimeEnd();
3299: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3300: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3301: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3302: VecCopy_SeqCUDA(yy, zz); /* zz = yy */
3303: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3304: VecAXPY_SeqCUDA(zz, 1.0, yy); /* zz += yy */
3305: }
3306: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3307: VecSet_SeqCUDA(zz, 0);
3308: }
3310: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3311: if (compressed) {
3312: PetscLogGpuTimeBegin();
3313: /* I wanted to make this for_each asynchronous, but failed. thrust::async::for_each() returns an event (internally registered),
3314: and in the destructor at the end of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all events to
3315: prevent that. So I just added a ScatterAdd kernel instead.
3316: */
3317: #if 0
3318: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3319: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3320: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3321: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3322: VecCUDAPlusEquals());
3323: #else
3324: PetscInt n = matstruct->cprowIndices->size();
3325: ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3326: #endif
3327: PetscLogGpuTimeEnd();
3328: }
3329: } else {
3330: if (yy && yy != zz) { VecAXPY_SeqCUDA(zz, 1.0, yy); /* zz += yy */ }
3331: }
3332: VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray);
3333: if (yy == zz) VecCUDARestoreArray(zz, &zarray);
3334: else VecCUDARestoreArrayWrite(zz, &zarray);
3335: } catch (char *ex) {
3336: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3337: }
3338: if (yy) {
3339: PetscLogGpuFlops(2.0 * a->nz);
3340: } else {
3341: PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt);
3342: }
3343: return 0;
3344: }
3346: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3347: {
3348: MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE);
3349: return 0;
3350: }
3352: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3353: {
3354: PetscObjectState onnz = A->nonzerostate;
3355: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3357: MatAssemblyEnd_SeqAIJ(A, mode);
3358: if (onnz != A->nonzerostate && cusp->deviceMat) {
3359: PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
3360: cudaFree(cusp->deviceMat);
3361: cusp->deviceMat = NULL;
3362: }
3363: return 0;
3364: }
3366: /* --------------------------------------------------------------------------------*/
3367: /*@
3368: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3369: (the default parallel PETSc format). This matrix will ultimately be pushed down
3370: to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
3371: assembly performance the user should preallocate the matrix storage by setting
3372: the parameter nz (or the array nnz). By setting these parameters accurately,
3373: performance during matrix assembly can be increased by more than a factor of 50.
3375: Collective
3377: Input Parameters:
3378: + comm - MPI communicator, set to `PETSC_COMM_SELF`
3379: . m - number of rows
3380: . n - number of columns
3381: . nz - number of nonzeros per row (same for all rows)
3382: - nnz - array containing the number of nonzeros in the various rows
3383: (possibly different for each row) or NULL
3385: Output Parameter:
3386: . A - the matrix
3388: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3389: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3390: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3392: Notes:
3393: If nnz is given then nz is ignored
3395: The AIJ format, also called
3396: compressed row storage, is fully compatible with standard Fortran 77
3397: storage. That is, the stored row and column indices can begin at
3398: either one (as in Fortran) or zero. See the users' manual for details.
3400: Specify the preallocated storage with either nz or nnz (not both).
3401: Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3402: allocation. For large problems you MUST preallocate memory or you
3403: will get TERRIBLE performance, see the users' manual chapter on matrices.
3405: By default, this format uses inodes (identical nodes) when possible, to
3406: improve numerical efficiency of matrix-vector products and solves. We
3407: search for consecutive rows with the same nonzero structure, thereby
3408: reusing matrix information to achieve increased efficiency.
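  Example Usage:
  A minimal sketch of the recommended paradigm (error checking omitted; m, n and nz are assumed to be set by the caller):
.vb
  Mat A;
  MatCreate(PETSC_COMM_SELF, &A);
  MatSetSizes(A, m, n, m, n);
  MatSetType(A, MATSEQAIJCUSPARSE);
  MatSeqAIJSetPreallocation(A, nz, NULL);
.ve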
3410: Level: intermediate
3412: .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
3413: @*/
3414: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3415: {
3416: MatCreate(comm, A);
3417: MatSetSizes(*A, m, n, m, n);
3418: MatSetType(*A, MATSEQAIJCUSPARSE);
3419: MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz);
3420: return 0;
3421: }
3423: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3424: {
3425: if (A->factortype == MAT_FACTOR_NONE) {
3426: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr);
3427: } else {
3428: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr);
3429: }
3430: PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3431: PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL);
3432: PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL);
3433: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL);
3434: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL);
3435: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL);
3436: PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL);
3437: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3438: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3439: PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL);
3440: MatDestroy_SeqAIJ(A);
3441: return 0;
3442: }
3444: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
3445: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3446: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3447: {
3448: MatDuplicate_SeqAIJ(A, cpvalues, B);
3449: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B);
3450: return 0;
3451: }
3453: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3454: {
3455: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3456: Mat_SeqAIJCUSPARSE *cy;
3457: Mat_SeqAIJCUSPARSE *cx;
3458: PetscScalar *ay;
3459: const PetscScalar *ax;
3460: CsrMatrix *csry, *csrx;
3462: cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3463: cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3464: if (X->ops->axpy != Y->ops->axpy) {
3465: MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3466: MatAXPY_SeqAIJ(Y, a, X, str);
3467: return 0;
3468: }
3469: /* if we are here, it means both matrices are bound to GPU */
3470: MatSeqAIJCUSPARSECopyToGPU(Y);
3471: MatSeqAIJCUSPARSECopyToGPU(X);
3474: csry = (CsrMatrix *)cy->mat->mat;
3475: csrx = (CsrMatrix *)cx->mat->mat;
3476: /* see if we can turn this into a cublas axpy */
3477: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3478: bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3479: if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3480: if (eq) str = SAME_NONZERO_PATTERN;
3481: }
3482: /* spgeam is buggy with one column */
3483: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3485: if (str == SUBSET_NONZERO_PATTERN) {
3486: PetscScalar b = 1.0;
3487: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3488: size_t bufferSize;
3489: void *buffer;
3490: #endif
3492: MatSeqAIJCUSPARSEGetArrayRead(X, &ax);
3493: MatSeqAIJCUSPARSEGetArray(Y, &ay);
3494: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);
3495: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3496: PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3497: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
3498: cudaMalloc(&buffer, bufferSize);
3499: PetscLogGpuTimeBegin();
3500: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3501: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
3502: PetscLogGpuFlops(x->nz + y->nz);
3503: PetscLogGpuTimeEnd();
3504: cudaFree(buffer);
3505: #else
3506: PetscLogGpuTimeBegin();
3507: PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
3508: csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
3509: PetscLogGpuFlops(x->nz + y->nz);
3510: PetscLogGpuTimeEnd();
3511: #endif
3512: cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);
3513: MatSeqAIJCUSPARSERestoreArrayRead(X, &ax);
3514: MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3515: MatSeqAIJInvalidateDiagonal(Y);
3516: } else if (str == SAME_NONZERO_PATTERN) {
3517: cublasHandle_t cublasv2handle;
3518: PetscBLASInt one = 1, bnz = 1;
3520: MatSeqAIJCUSPARSEGetArrayRead(X, &ax);
3521: MatSeqAIJCUSPARSEGetArray(Y, &ay);
3522: PetscCUBLASGetHandle(&cublasv2handle);
3523: PetscBLASIntCast(x->nz, &bnz);
3524: PetscLogGpuTimeBegin();
3525: cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one);
3526: PetscLogGpuFlops(2.0 * bnz);
3527: PetscLogGpuTimeEnd();
3528: MatSeqAIJCUSPARSERestoreArrayRead(X, &ax);
3529: MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3530: MatSeqAIJInvalidateDiagonal(Y);
3531: } else {
3532: MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE);
3533: MatAXPY_SeqAIJ(Y, a, X, str);
3534: }
3535: return 0;
3536: }
3538: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3539: {
3540: Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
3541: PetscScalar *ay;
3542: cublasHandle_t cublasv2handle;
3543: PetscBLASInt one = 1, bnz = 1;
3545: MatSeqAIJCUSPARSEGetArray(Y, &ay);
3546: PetscCUBLASGetHandle(&cublasv2handle);
3547: PetscBLASIntCast(y->nz, &bnz);
3548: PetscLogGpuTimeBegin();
3549: cublasXscal(cublasv2handle, bnz, &a, ay, one);
3550: PetscLogGpuFlops(bnz);
3551: PetscLogGpuTimeEnd();
3552: MatSeqAIJCUSPARSERestoreArray(Y, &ay);
3553: MatSeqAIJInvalidateDiagonal(Y);
3554: return 0;
3555: }
3557: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3558: {
3559: PetscBool both = PETSC_FALSE;
3560: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3562: if (A->factortype == MAT_FACTOR_NONE) {
3563: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
3564: if (spptr->mat) {
3565: CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
3566: if (matrix->values) {
3567: both = PETSC_TRUE;
3568: thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3569: }
3570: }
3571: if (spptr->matTranspose) {
3572: CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3573: if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3574: }
3575: }
3576: PetscArrayzero(a->a, a->i[A->rmap->n]);
3577: MatSeqAIJInvalidateDiagonal(A);
3578: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3579: else A->offloadmask = PETSC_OFFLOAD_CPU;
3580: return 0;
3581: }
3583: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3584: {
3585: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3587: if (A->factortype != MAT_FACTOR_NONE) {
3588: A->boundtocpu = flg;
3589: return 0;
3590: }
3591: if (flg) {
3592: MatSeqAIJCUSPARSECopyFromGPU(A);
3594: A->ops->scale = MatScale_SeqAIJ;
3595: A->ops->axpy = MatAXPY_SeqAIJ;
3596: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3597: A->ops->mult = MatMult_SeqAIJ;
3598: A->ops->multadd = MatMultAdd_SeqAIJ;
3599: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3600: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3601: A->ops->multhermitiantranspose = NULL;
3602: A->ops->multhermitiantransposeadd = NULL;
3603: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3604: PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps));
3605: PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL);
3606: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL);
3607: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL);
3608: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
3609: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
3610: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL);
3611: } else {
3612: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3613: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3614: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3615: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3616: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3617: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3618: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3619: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3620: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3621: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3622: a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE;
3623: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
3624: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
3625: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
3626: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
3627: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3628: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
3630: PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3631: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3632: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3633: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE);
3634: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE);
3635: PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE);
3636: }
3637: A->boundtocpu = flg;
3638: if (flg && a->inode.size) {
3639: a->inode.use = PETSC_TRUE;
3640: } else {
3641: a->inode.use = PETSC_FALSE;
3642: }
3643: return 0;
3644: }
3646: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
3647: {
3648: Mat B;
3650: PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* first use of CUSPARSE may be via MatConvert */
3651: if (reuse == MAT_INITIAL_MATRIX) {
3652: MatDuplicate(A, MAT_COPY_VALUES, newmat);
3653: } else if (reuse == MAT_REUSE_MATRIX) {
3654: MatCopy(A, *newmat, SAME_NONZERO_PATTERN);
3655: }
3656: B = *newmat;
3658: PetscFree(B->defaultvectype);
3659: PetscStrallocpy(VECCUDA, &B->defaultvectype);
3661: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3662: if (B->factortype == MAT_FACTOR_NONE) {
3663: Mat_SeqAIJCUSPARSE *spptr;
3664: PetscNew(&spptr);
3665: cusparseCreate(&spptr->handle);
3666: cusparseSetStream(spptr->handle, PetscDefaultCudaStream);
3667: spptr->format = MAT_CUSPARSE_CSR;
3668: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3669: #if CUSPARSE_VERSION > 11301
3670: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3671: #else
3672: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3673: #endif
3674: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3675: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3676: #endif
3677: B->spptr = spptr;
3678: } else {
3679: Mat_SeqAIJCUSPARSETriFactors *spptr;
3681: PetscNew(&spptr);
3682: cusparseCreate(&spptr->handle);
3683: cusparseSetStream(spptr->handle, PetscDefaultCudaStream);
3684: B->spptr = spptr;
3685: }
3686: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3687: }
3688: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3689: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3690: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3691: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3692: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3693: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3695: MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE);
3696: PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE);
3697: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3698: #if defined(PETSC_HAVE_HYPRE)
3699: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE);
3700: #endif
3701: PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);
3702: return 0;
3703: }
3705: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3706: {
3707: MatCreate_SeqAIJ(B);
3708: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B);
3709: return 0;
3710: }
3712: /*MC
3713: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3715: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3716: CSR, ELL, or Hybrid format.
3717: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
3719: Options Database Keys:
3720: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
3721: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3722: . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
3723: - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3725: Level: beginner
3727: .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3728: M*/
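/* A minimal usage sketch, not part of this source file: build a small tridiagonal matrix of this
   type. The helper name ExampleCreateTridiagCUSPARSE is hypothetical; the same effect is obtained
   with MatSetFromOptions() plus the -mat_type aijcusparse option described above. */
static PetscErrorCode ExampleCreateTridiagCUSPARSE(PetscInt n, Mat *A)
{
  MatCreate(PETSC_COMM_SELF, A);
  MatSetSizes(*A, n, n, n, n);
  MatSetType(*A, MATSEQAIJCUSPARSE);      /* values and MatMult() will live on the GPU */
  MatSeqAIJSetPreallocation(*A, 3, NULL); /* at most 3 nonzeros per row */
  for (PetscInt i = 0; i < n; i++) {
    if (i > 0) MatSetValue(*A, i, i - 1, -1.0, INSERT_VALUES);
    MatSetValue(*A, i, i, 2.0, INSERT_VALUES);
    if (i < n - 1) MatSetValue(*A, i, i + 1, -1.0, INSERT_VALUES);
  }
  MatAssemblyBegin(*A, MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(*A, MAT_FINAL_ASSEMBLY);
  return 0;
}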
3730: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
3732: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3733: {
3734: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band);
3735: MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse);
3736: MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse);
3737: MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse);
3738: MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse);
3740: return 0;
3741: }
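/* A hedged sketch, not part of this source file, of selecting the factorization registered above
   from code; the helper name ExampleCUSPARSELUSolve is hypothetical. The same choice is normally
   made on a PCLU/PCILU preconditioner with -pc_factor_mat_solver_type cusparse. */
static PetscErrorCode ExampleCUSPARSELUSolve(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            rowperm, colperm;
  MatFactorInfo info;

  MatGetFactor(A, MATSOLVERCUSPARSE, MAT_FACTOR_LU, &F);
  MatGetOrdering(A, MATORDERINGND, &rowperm, &colperm);
  MatFactorInfoInitialize(&info);
  MatLUFactorSymbolic(F, A, rowperm, colperm, &info);
  MatLUFactorNumeric(F, A, &info);
  MatSolve(F, b, x); /* triangular solves run on the GPU unless MatCUSPARSESetUseCPUSolve() was enabled */
  ISDestroy(&rowperm);
  ISDestroy(&colperm);
  MatDestroy(&F);
  return 0;
}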
3743: static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3744: {
3745: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
3747: if (!cusp) return 0;
3748: delete cusp->cooPerm;
3749: delete cusp->cooPerm_a;
3750: cusp->cooPerm = NULL;
3751: cusp->cooPerm_a = NULL;
3752: if (cusp->use_extended_coo) {
3753: cudaFree(cusp->jmap_d);
3754: cudaFree(cusp->perm_d);
3755: }
3756: cusp->use_extended_coo = PETSC_FALSE;
3757: return 0;
3758: }
3760: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3761: {
3762: if (*cusparsestruct) {
3763: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format);
3764: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format);
3765: delete (*cusparsestruct)->workVector;
3766: delete (*cusparsestruct)->rowoffsets_gpu;
3767: delete (*cusparsestruct)->cooPerm;
3768: delete (*cusparsestruct)->cooPerm_a;
3769: delete (*cusparsestruct)->csr2csc_i;
3770: if ((*cusparsestruct)->handle) cusparseDestroy((*cusparsestruct)->handle);
3771: if ((*cusparsestruct)->jmap_d) cudaFree((*cusparsestruct)->jmap_d);
3772: if ((*cusparsestruct)->perm_d) cudaFree((*cusparsestruct)->perm_d);
3773: PetscFree(*cusparsestruct);
3774: }
3775: return 0;
3776: }
3778: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3779: {
3780: if (*mat) {
3781: delete (*mat)->values;
3782: delete (*mat)->column_indices;
3783: delete (*mat)->row_offsets;
3784: delete *mat;
3785: *mat = 0;
3786: }
3787: return 0;
3788: }
3790: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3791: {
3792: if (*trifactor) {
3793: if ((*trifactor)->descr) cusparseDestroyMatDescr((*trifactor)->descr);
3794: if ((*trifactor)->solveInfo) cusparseDestroyCsrsvInfo((*trifactor)->solveInfo);
3795: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3796: if ((*trifactor)->solveBuffer) cudaFree((*trifactor)->solveBuffer);
3797: if ((*trifactor)->AA_h) cudaFreeHost((*trifactor)->AA_h);
3798: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3799: if ((*trifactor)->csr2cscBuffer) cudaFree((*trifactor)->csr2cscBuffer);
3800: #endif
3801: PetscFree(*trifactor);
3802: }
3803: return 0;
3804: }
3806: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3807: {
3808: CsrMatrix *mat;
3810: if (*matstruct) {
3811: if ((*matstruct)->mat) {
3812: if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3813: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3814: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3815: #else
3816: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3817: cusparseDestroyHybMat(hybMat);
3818: #endif
3819: } else {
3820: mat = (CsrMatrix *)(*matstruct)->mat;
3821: CsrMatrix_Destroy(&mat);
3822: }
3823: }
3824: if ((*matstruct)->descr) cusparseDestroyMatDescr((*matstruct)->descr);
3825: delete (*matstruct)->cprowIndices;
3826: if ((*matstruct)->alpha_one) cudaFree((*matstruct)->alpha_one);
3827: if ((*matstruct)->beta_zero) cudaFree((*matstruct)->beta_zero);
3828: if ((*matstruct)->beta_one) cudaFree((*matstruct)->beta_one);
3830: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3831: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3832: if (mdata->matDescr) cusparseDestroySpMat(mdata->matDescr);
3833: for (int i = 0; i < 3; i++) {
3834: if (mdata->cuSpMV[i].initialized) {
3835: cudaFree(mdata->cuSpMV[i].spmvBuffer);
3836: cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);
3837: cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);
3838: }
3839: }
3840: #endif
3841: delete *matstruct;
3842: *matstruct = NULL;
3843: }
3844: return 0;
3845: }
3847: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3848: {
3849: Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3851: if (fs) {
3852: MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);
3853: MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);
3854: MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);
3855: MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);
3856: delete fs->rpermIndices;
3857: delete fs->cpermIndices;
3858: delete fs->workVector;
3859: fs->rpermIndices = NULL;
3860: fs->cpermIndices = NULL;
3861: fs->workVector = NULL;
3862: if (fs->a_band_d) cudaFree(fs->a_band_d);
3863: if (fs->i_band_d) cudaFree(fs->i_band_d);
3864: fs->init_dev_prop = PETSC_FALSE;
3865: #if CUSPARSE_VERSION >= 11500
3866: cudaFree(fs->csrRowPtr);
3867: cudaFree(fs->csrColIdx);
3868: cudaFree(fs->csrVal);
3869: cudaFree(fs->X);
3870: cudaFree(fs->Y);
3871: // cudaFree(fs->factBuffer_M); /* Not needed since factBuffer_M is shared with one of spsvBuffer_L/U */
3872: cudaFree(fs->spsvBuffer_L);
3873: cudaFree(fs->spsvBuffer_U);
3874: cudaFree(fs->spsvBuffer_Lt);
3875: cudaFree(fs->spsvBuffer_Ut);
3876: cusparseDestroyMatDescr(fs->matDescr_M);
3877: cusparseDestroySpMat(fs->spMatDescr_L);
3878: cusparseDestroySpMat(fs->spMatDescr_U);
3879: cusparseSpSV_destroyDescr(fs->spsvDescr_L);
3880: cusparseSpSV_destroyDescr(fs->spsvDescr_Lt);
3881: cusparseSpSV_destroyDescr(fs->spsvDescr_U);
3882: cusparseSpSV_destroyDescr(fs->spsvDescr_Ut);
3883: cusparseDestroyDnVec(fs->dnVecDescr_X);
3884: cusparseDestroyDnVec(fs->dnVecDescr_Y);
3885: cusparseDestroyCsrilu02Info(fs->ilu0Info_M);
3886: cusparseDestroyCsric02Info(fs->ic0Info_M);
3888: fs->createdTransposeSpSVDescr = PETSC_FALSE;
3889: fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3890: #endif
3891: }
3892: return 0;
3893: }
3895: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3896: {
3897: cusparseHandle_t handle;
3899: if (*trifactors) {
3900: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3901: if (handle = (*trifactors)->handle) cusparseDestroy(handle);
3902: PetscFree(*trifactors);
3903: }
3904: return 0;
3905: }
3907: struct IJCompare {
3908: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3909: {
3910: if (t1.get<0>() < t2.get<0>()) return true;
3911: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3912: return false;
3913: }
3914: };
3916: struct IJEqual {
3917: __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3918: {
3919: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3920: return true;
3921: }
3922: };
3924: struct IJDiff {
3925: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
3926: };
3928: struct IJSum {
3929: __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
3930: };
3932: #include <thrust/iterator/discard_iterator.h>
3933: /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3934: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3935: {
3936: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
3937: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3938: THRUSTARRAY *cooPerm_v = NULL;
3939: thrust::device_ptr<const PetscScalar> d_v;
3940: CsrMatrix *matrix;
3941: PetscInt n;
3945: if (!cusp->cooPerm) {
3946: MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
3947: MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
3948: return 0;
3949: }
3950: matrix = (CsrMatrix *)cusp->mat->mat;
3952: if (!v) {
3953: if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
3954: goto finalize;
3955: }
3956: n = cusp->cooPerm->size();
3957: if (isCudaMem(v)) {
3958: d_v = thrust::device_pointer_cast(v);
3959: } else {
3960: cooPerm_v = new THRUSTARRAY(n);
3961: cooPerm_v->assign(v, v + n);
3962: d_v = cooPerm_v->data();
3963: PetscLogCpuToGpu(n * sizeof(PetscScalar));
3964: }
3965: PetscLogGpuTimeBegin();
3966: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3967: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up */
3968: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3969: auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3970: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3971: cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3972: cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3973: */
3974: thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3975: thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
3976: delete cooPerm_w;
3977: } else {
3978: /* all nonzeros in d_v[] are unique entries */
3979: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3980: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3981: thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
3982: }
3983: } else {
3984: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3985: auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
3986: thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
3987: } else {
3988: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
3989: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
3990: thrust::for_each(zibit, zieit, VecCUDAEquals());
3991: }
3992: }
3993: PetscLogGpuTimeEnd();
3994: finalize:
3995: delete cooPerm_v;
3996: A->offloadmask = PETSC_OFFLOAD_GPU;
3997: PetscObjectStateIncrease((PetscObject)A);
3998: /* shorter version of MatAssemblyEnd_SeqAIJ */
3999: PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz);
4000: PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n");
4001: PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax);
4002: a->reallocs = 0;
4003: A->info.mallocs += 0;
4004: A->info.nz_unneeded = 0;
4005: A->assembled = A->was_assembled = PETSC_TRUE;
4006: A->num_ass++;
4007: return 0;
4008: }
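/* A small self-contained illustration, not part of this source file, of the thrust::reduce_by_key
   pattern used above to combine repeated COO entries. With keys cooPerm_a = [0,0,1,2,3,4], the two
   values sharing key 0 collapse into one output slot. The function name is hypothetical. */
static void ExampleReduceByKey(void)
{
  const PetscInt    k[6] = {0, 0, 1, 2, 3, 4};
  const PetscScalar v[6] = {1., 2., 3., 4., 5., 6.};
  THRUSTINTARRAY    keys(6);
  THRUSTARRAY       vals(6), out(5);

  keys.assign(k, k + 6);
  vals.assign(v, v + 6);
  thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(), thrust::make_discard_iterator(), out.begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
  /* out = [3,3,4,5,6]: the values 1 and 2, which share key 0, have been summed */
}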
4010: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4011: {
4012: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4015: if (!cusp) return 0;
4016: if (destroy) {
4017: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format);
4018: delete cusp->csr2csc_i;
4019: cusp->csr2csc_i = NULL;
4020: }
4021: A->transupdated = PETSC_FALSE;
4022: return 0;
4023: }
4025: #include <thrust/binary_search.h>
4026: /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4027: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4028: {
4029: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4030: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4031: PetscInt cooPerm_n, nzr = 0;
4033: PetscLayoutSetUp(A->rmap);
4034: PetscLayoutSetUp(A->cmap);
4035: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4036: if (n != cooPerm_n) {
4037: delete cusp->cooPerm;
4038: delete cusp->cooPerm_a;
4039: cusp->cooPerm = NULL;
4040: cusp->cooPerm_a = NULL;
4041: }
4042: if (n) {
4043: thrust::device_ptr<PetscInt> d_i, d_j;
4044: PetscInt *d_raw_i, *d_raw_j;
4045: PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4046: PetscMemType imtype, jmtype;
4048: PetscGetMemType(coo_i, &imtype);
4049: if (PetscMemTypeHost(imtype)) {
4050: cudaMalloc(&d_raw_i, sizeof(PetscInt) * n);
4051: cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice);
4052: d_i = thrust::device_pointer_cast(d_raw_i);
4053: free_raw_i = PETSC_TRUE;
4054: PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
4055: } else {
4056: d_i = thrust::device_pointer_cast(coo_i);
4057: }
4059: PetscGetMemType(coo_j, &jmtype);
4060: if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4061: cudaMalloc(&d_raw_j, sizeof(PetscInt) * n);
4062: cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice);
4063: d_j = thrust::device_pointer_cast(d_raw_j);
4064: free_raw_j = PETSC_TRUE;
4065: PetscLogCpuToGpu(1. * n * sizeof(PetscInt));
4066: } else {
4067: d_j = thrust::device_pointer_cast(coo_j);
4068: }
4070: THRUSTINTARRAY ii(A->rmap->n);
4072: if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4073: if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
4075: /* Ex.
4076: n = 6
4077: coo_i = [3,3,1,4,1,4]
4078: coo_j = [3,2,2,5,2,6]
4079: */
4080: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4081: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
4083: PetscLogGpuTimeBegin();
4084: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4085: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4086: (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */
4087: THRUSTINTARRAY w(d_j, d_j + n);
4089: /*
4090: d_i = [1,1,3,3,4,4]
4091: d_j = [2,2,2,3,5,6]
4092: cooPerm = [2,4,1,0,3,5]
4093: */
4094: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4096: /*
4097: d_i = [1,3,3,4,4,x]
4098: ^ekey
4099: d_j = [2,2,3,5,6,x]
4100: ^nekey
4101: */
4102: if (nekey == ekey) { /* all entries are unique */
4103: delete cusp->cooPerm_a;
4104: cusp->cooPerm_a = NULL;
4105: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4106: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4107: adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4108: adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4109: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4110: w[0] = 0;
4111: thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
4112: thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4113: }
4114: thrust::counting_iterator<PetscInt> search_begin(0);
4115: thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4116: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4117: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4118: PetscLogGpuTimeEnd();
4120: MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i);
4121: a->singlemalloc = PETSC_FALSE;
4122: a->free_a = PETSC_TRUE;
4123: a->free_ij = PETSC_TRUE;
4124: PetscMalloc1(A->rmap->n + 1, &a->i);
4125: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4126: cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4127: a->nz = a->maxnz = a->i[A->rmap->n];
4128: a->rmax = 0;
4129: PetscMalloc1(a->nz, &a->a);
4130: PetscMalloc1(a->nz, &a->j);
4131: cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4132: if (!a->ilen) PetscMalloc1(A->rmap->n, &a->ilen);
4133: if (!a->imax) PetscMalloc1(A->rmap->n, &a->imax);
4134: for (PetscInt i = 0; i < A->rmap->n; i++) {
4135: const PetscInt nnzr = a->i[i + 1] - a->i[i];
4136: nzr += (PetscInt) !!(nnzr);
4137: a->ilen[i] = a->imax[i] = nnzr;
4138: a->rmax = PetscMax(a->rmax, nnzr);
4139: }
4140: a->nonzerorowcnt = nzr;
4141: A->preallocated = PETSC_TRUE;
4142: PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt));
4143: MatMarkDiagonal_SeqAIJ(A);
4144: if (free_raw_i) cudaFree(d_raw_i);
4145: if (free_raw_j) cudaFree(d_raw_j);
4146: } else {
4147: MatSeqAIJSetPreallocation(A, 0, NULL);
4148: }
4149: MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE);
4151: /* We want to allocate the CUSPARSE struct for matvec now.
4152: The code is so convoluted now that I prefer to copy zeros */
4153: PetscArrayzero(a->a, a->nz);
4154: MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6);
4155: A->offloadmask = PETSC_OFFLOAD_CPU;
4156: MatSeqAIJCUSPARSECopyToGPU(A);
4157: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE);
4158: return 0;
4159: }
4161: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4162: {
4163: Mat_SeqAIJ *seq;
4164: Mat_SeqAIJCUSPARSE *dev;
4165: PetscBool coo_basic = PETSC_TRUE;
4166: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
4168: MatResetPreallocationCOO_SeqAIJ(mat);
4169: MatResetPreallocationCOO_SeqAIJCUSPARSE(mat);
4170: if (coo_i) {
4171: PetscGetMemType(coo_i, &mtype);
4172: if (PetscMemTypeHost(mtype)) {
4173: for (PetscCount k = 0; k < coo_n; k++) {
4174: if (coo_i[k] < 0 || coo_j[k] < 0) {
4175: coo_basic = PETSC_FALSE;
4176: break;
4177: }
4178: }
4179: }
4180: }
4182: if (coo_basic) { /* i,j are on device or do not contain negative indices */
4183: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j);
4184: } else {
4185: MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j);
4186: mat->offloadmask = PETSC_OFFLOAD_CPU;
4187: MatSeqAIJCUSPARSECopyToGPU(mat);
4188: seq = static_cast<Mat_SeqAIJ *>(mat->data);
4189: dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4190: cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount));
4191: cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
4192: cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount));
4193: cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice);
4194: dev->use_extended_coo = PETSC_TRUE;
4195: }
4196: return 0;
4197: }
4199: __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4200: {
4201: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
4202: const PetscCount grid_size = gridDim.x * blockDim.x;
4203: for (; i < nnz; i += grid_size) {
4204: PetscScalar sum = 0.0;
4205: for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4206: a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4207: }
4208: }
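/* Worked illustration of the jmap[]/perm[] layout consumed by the kernel above (an assumption added
   for clarity, not taken from this file): for COO input coo_i = [0,1,1], coo_j = [0,1,1],
   v = [10,20,30] on a 2x2 matrix, the CSR matrix has 2 nonzeros, jmap = [0,1,3] (prefix sum of how
   many COO entries feed each nonzero) and perm = [0,1,2] (which COO entries feed each nonzero), so
   with INSERT_VALUES the kernel writes a[0] = v[0] = 10 and a[1] = v[1] + v[2] = 50. */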
4210: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4211: {
4212: Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data;
4213: Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;
4214: PetscCount Annz = seq->nz;
4215: PetscMemType memtype;
4216: const PetscScalar *v1 = v;
4217: PetscScalar *Aa;
4219: if (dev->use_extended_coo) {
4220: PetscGetMemType(v, &memtype);
4221: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, copy it to the device */
4222: cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar));
4223: cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
4224: }
4226: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa);
4227: else MatSeqAIJCUSPARSEGetArray(A, &Aa);
4229: if (Annz) {
4230: MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
4231: cudaPeekAtLastError();
4232: }
4234: if (imode == INSERT_VALUES) MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa);
4235: else MatSeqAIJCUSPARSERestoreArray(A, &Aa);
4237: if (PetscMemTypeHost(memtype)) cudaFree((void *)v1);
4238: } else {
4239: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode);
4240: }
4241: return 0;
4242: }
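/* A hedged usage sketch, not part of this source file, of the COO interface the two routines above
   implement. The helper name ExampleCOOAssembly is hypothetical and assumes A was already created as
   a 2x2 MATSEQAIJCUSPARSE matrix. */
static PetscErrorCode ExampleCOOAssembly(Mat A)
{
  PetscInt    coo_i[3] = {0, 1, 1};
  PetscInt    coo_j[3] = {0, 1, 1};
  PetscScalar v[3]     = {1.0, 2.0, 3.0};

  MatSetPreallocationCOO(A, 3, coo_i, coo_j); /* two entries target (1,1), so the matrix has 2 nonzeros */
  MatSetValuesCOO(A, v, INSERT_VALUES);       /* repeated entries are summed: A(1,1) = 2.0 + 3.0 */
  return 0;
}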
4244: /*@C
4245: MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
4247: Not collective
4249: Input Parameters:
4250: + A - the matrix
4251: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4253: Output Parameters:
4254: + ia - the CSR row pointers
4255: - ja - the CSR column indices
4257: Level: developer
4259: Note:
4260: When compressed is true, the CSR structure does not contain empty rows
4262: .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4263: @*/
4264: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4265: {
4266: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4267: CsrMatrix *csr;
4268: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4271: if (!i || !j) return 0;
4274: MatSeqAIJCUSPARSECopyToGPU(A);
4276: csr = (CsrMatrix *)cusp->mat->mat;
4277: if (i) {
4278: if (!compressed && a->compressedrow.use) { /* need full row offset */
4279: if (!cusp->rowoffsets_gpu) {
4280: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4281: cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4282: PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4283: }
4284: *i = cusp->rowoffsets_gpu->data().get();
4285: } else *i = csr->row_offsets->data().get();
4286: }
4287: if (j) *j = csr->column_indices->data().get();
4288: return 0;
4289: }
4291: /*@C
4292: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
4294: Not collective
4296: Input Parameters:
4297: + A - the matrix
4298: - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
4300: Output Parameters:
4301: + ia - the CSR row pointers
4302: - ja - the CSR column indices
4304: Level: developer
4306: .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4307: @*/
4308: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4309: {
4312: if (i) *i = NULL;
4313: if (j) *j = NULL;
4314: return 0;
4315: }
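/* A hedged sketch, not part of this source file, showing how the two routines above can be used; the
   helper name ExampleCountRowNonzeros is hypothetical. The returned pointers are device memory, hence
   the explicit cudaMemcpy back to the host. */
static PetscErrorCode ExampleCountRowNonzeros(Mat A, PetscInt row, PetscInt *nnz)
{
  const int *di, *dj;
  int        h[2];

  MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &di, &dj); /* PETSC_FALSE: full (uncompressed) row offsets */
  cudaMemcpy(h, di + row, 2 * sizeof(int), cudaMemcpyDeviceToHost);
  *nnz = (PetscInt)(h[1] - h[0]);
  MatSeqAIJCUSPARSERestoreIJ(A, PETSC_FALSE, &di, &dj);
  return 0;
}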
4317: /*@C
4318: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4320: Not Collective
4322: Input Parameter:
4323: . A - a `MATSEQAIJCUSPARSE` matrix
4325: Output Parameter:
4326: . a - pointer to the device data
4328: Level: developer
4330: Note:
4331: May trigger host-device copies if up-to-date matrix data is on host
4333: .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4334: @*/
4335: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4336: {
4337: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4338: CsrMatrix *csr;
4344: MatSeqAIJCUSPARSECopyToGPU(A);
4346: csr = (CsrMatrix *)cusp->mat->mat;
4348: *a = csr->values->data().get();
4349: return 0;
4350: }
4352: /*@C
4353: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
4355: Not Collective
4357: Input Parameter:
4358: . A - a `MATSEQAIJCUSPARSE` matrix
4360: Output Parameter:
4361: . a - pointer to the device data
4363: Level: developer
4365: .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4366: @*/
4367: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
4368: {
4372: *a = NULL;
4373: return 0;
4374: }
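/* A hedged sketch, not part of this source file; ExampleSumValues is a hypothetical helper that sums
   all stored values of A directly on the device with thrust (thrust::reduce is assumed available,
   since reduce_by_key is already used in this file). */
static PetscErrorCode ExampleSumValues(Mat A, PetscScalar *sum)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *da;

  MatSeqAIJCUSPARSEGetArrayRead(A, &da); /* copies host values to the GPU if they are newer */
  auto dptr = thrust::device_pointer_cast(da);
  *sum = thrust::reduce(thrust::device, dptr, dptr + a->nz, (PetscScalar)0.0, thrust::plus<PetscScalar>());
  MatSeqAIJCUSPARSERestoreArrayRead(A, &da);
  return 0;
}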
4376: /*@C
4377: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4379: Not Collective
4381: Input Parameter:
4382: . A - a `MATSEQAIJCUSPARSE` matrix
4384: Output Parameter:
4385: . a - pointer to the device data
4387: Level: developer
4389: Note:
4390: May trigger host-device copies if up-to-date matrix data is on host
4392: .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4393: @*/
4394: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
4395: {
4396: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4397: CsrMatrix *csr;
4403: MatSeqAIJCUSPARSECopyToGPU(A);
4405: csr = (CsrMatrix *)cusp->mat->mat;
4407: *a = csr->values->data().get();
4408: A->offloadmask = PETSC_OFFLOAD_GPU;
4409: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
4410: return 0;
4411: }
4412: /*@C
4413: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4415: Not Collective
4417: Input Parameter:
4418: . A - a `MATSEQAIJCUSPARSE` matrix
4420: Output Parameter:
4421: . a - pointer to the device data
4423: Level: developer
4425: .seealso: `MatSeqAIJCUSPARSEGetArray()`
4426: @*/
4427: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
4428: {
4432: MatSeqAIJInvalidateDiagonal(A);
4433: PetscObjectStateIncrease((PetscObject)A);
4434: *a = NULL;
4435: return 0;
4436: }
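/* A hedged sketch, not part of this source file; ExampleShiftValues is a hypothetical helper that adds
   a constant to every stored value of A, working in place on the device array. */
static PetscErrorCode ExampleShiftValues(Mat A, PetscScalar shift)
{
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)A->data;
  PetscScalar *da;

  MatSeqAIJCUSPARSEGetArray(A, &da); /* read-write access: existing values are preserved */
  auto dptr = thrust::device_pointer_cast(da);
  thrust::transform(thrust::device, dptr, dptr + a->nz, thrust::make_constant_iterator(shift), dptr, thrust::plus<PetscScalar>());
  MatSeqAIJCUSPARSERestoreArray(A, &da); /* marks the GPU copy as the valid one */
  return 0;
}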
4438: /*@C
4439: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
4441: Not Collective
4443: Input Parameter:
4444: . A - a `MATSEQAIJCUSPARSE` matrix
4446: Output Parameter:
4447: . a - pointer to the device data
4449: Level: developer
4451: Note:
4452: Does not trigger host-device copies and flags data validity on the GPU
4454: .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4455: @*/
4456: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
4457: {
4458: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4459: CsrMatrix *csr;
4466: csr = (CsrMatrix *)cusp->mat->mat;
4468: *a = csr->values->data().get();
4469: A->offloadmask = PETSC_OFFLOAD_GPU;
4470: MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE);
4471: return 0;
4472: }
4474: /*@C
4475: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
4477: Not Collective
4479: Input Parameter:
4480: . A - a `MATSEQAIJCUSPARSE` matrix
4482: Output Parameter:
4483: . a - pointer to the device data
4485: Level: developer
4487: .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4488: @*/
4489: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
4490: {
4494: MatSeqAIJInvalidateDiagonal(A);
4495: PetscObjectStateIncrease((PetscObject)A);
4496: *a = NULL;
4497: return 0;
4498: }
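/* A hedged sketch, not part of this source file; ExampleSetAllValues is a hypothetical helper that
   overwrites all stored values without first copying stale host data to the GPU. */
static PetscErrorCode ExampleSetAllValues(Mat A, PetscScalar val)
{
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)A->data;
  PetscScalar *da;

  MatSeqAIJCUSPARSEGetArrayWrite(A, &da); /* write-only: previous values are not copied over */
  thrust::fill(thrust::device, thrust::device_pointer_cast(da), thrust::device_pointer_cast(da) + a->nz, val);
  MatSeqAIJCUSPARSERestoreArrayWrite(A, &da);
  return 0;
}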
4500: struct IJCompare4 {
4501: __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4502: {
4503: if (t1.get<0>() < t2.get<0>()) return true;
4504: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4505: return false;
4506: }
4507: };
4509: struct Shift {
4510: int _shift;
4512: Shift(int shift) : _shift(shift) { }
4513: __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4514: };
4516: /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows side by side, i.e. C = [A B] ([A';B']' in MATLAB notation) */
4517: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4518: {
4519: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4520: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4521: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4522: CsrMatrix *Acsr, *Bcsr, *Ccsr;
4523: PetscInt Annz, Bnnz;
4524: cusparseStatus_t stat;
4525: PetscInt i, m, n, zero = 0;
4536: if (reuse == MAT_INITIAL_MATRIX) {
4537: m = A->rmap->n;
4538: n = A->cmap->n + B->cmap->n;
4539: MatCreate(PETSC_COMM_SELF, C);
4540: MatSetSizes(*C, m, n, m, n);
4541: MatSetType(*C, MATSEQAIJCUSPARSE);
4542: c = (Mat_SeqAIJ *)(*C)->data;
4543: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4544: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4545: Ccsr = new CsrMatrix;
4546: Cmat->cprowIndices = NULL;
4547: c->compressedrow.use = PETSC_FALSE;
4548: c->compressedrow.nrows = 0;
4549: c->compressedrow.i = NULL;
4550: c->compressedrow.rindex = NULL;
4551: Ccusp->workVector = NULL;
4552: Ccusp->nrows = m;
4553: Ccusp->mat = Cmat;
4554: Ccusp->mat->mat = Ccsr;
4555: Ccsr->num_rows = m;
4556: Ccsr->num_cols = n;
4557: cusparseCreateMatDescr(&Cmat->descr);
4558: cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);
4559: cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4560: cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar));
4561: cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar));
4562: cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));
4563: cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4564: cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4565: cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4566: MatSeqAIJCUSPARSECopyToGPU(A);
4567: MatSeqAIJCUSPARSECopyToGPU(B);
4571: Acsr = (CsrMatrix *)Acusp->mat->mat;
4572: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4573: Annz = (PetscInt)Acsr->column_indices->size();
4574: Bnnz = (PetscInt)Bcsr->column_indices->size();
4575: c->nz = Annz + Bnnz;
4576: Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1);
4577: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4578: Ccsr->values = new THRUSTARRAY(c->nz);
4579: Ccsr->num_entries = c->nz;
4580: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4581: if (c->nz) {
4582: auto Acoo = new THRUSTINTARRAY32(Annz);
4583: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4584: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4585: THRUSTINTARRAY32 *Aroff, *Broff;
4587: if (a->compressedrow.use) { /* need full row offset */
4588: if (!Acusp->rowoffsets_gpu) {
4589: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4590: Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
4591: PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt));
4592: }
4593: Aroff = Acusp->rowoffsets_gpu;
4594: } else Aroff = Acsr->row_offsets;
4595: if (b->compressedrow.use) { /* need full row offset */
4596: if (!Bcusp->rowoffsets_gpu) {
4597: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4598: Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
4599: PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt));
4600: }
4601: Broff = Bcusp->rowoffsets_gpu;
4602: } else Broff = Bcsr->row_offsets;
4603: PetscLogGpuTimeBegin();
4604: stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4605: PetscCallCUSPARSE(stat);
4606: stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4607: PetscCallCUSPARSE(stat);
4608: /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4609: auto Aperm = thrust::make_constant_iterator(1);
4610: auto Bperm = thrust::make_constant_iterator(0);
4611: #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4612: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4613: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
4614: #else
4615: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4616: auto Bcib = Bcsr->column_indices->begin();
4617: auto Bcie = Bcsr->column_indices->end();
4618: thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
4619: #endif
4620: auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
4621: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
4622: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
4623: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
4624: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
4625: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4626: auto p1 = Ccusp->cooPerm->begin();
4627: auto p2 = Ccusp->cooPerm->begin();
4628: thrust::advance(p2, Annz);
4629: thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4());
4630: #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
4631: thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
4632: #endif
4633: auto cci = thrust::make_counting_iterator(zero);
4634: auto cce = thrust::make_counting_iterator(c->nz);
4635: #if 0 //Errors on SUMMIT cuda 11.1.0
4636: thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>());
4637: #else
4638: auto pred = thrust::identity<int>();
4639: thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred);
4640: thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred);
4641: #endif
4642: stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
4643: PetscCallCUSPARSE(stat);
4644: PetscLogGpuTimeEnd();
4645: delete wPerm;
4646: delete Acoo;
4647: delete Bcoo;
4648: delete Ccoo;
4649: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4650: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4651: PetscCallCUSPARSE(stat);
4652: #endif
4653: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4654: MatSeqAIJCUSPARSEFormExplicitTranspose(A);
4655: MatSeqAIJCUSPARSEFormExplicitTranspose(B);
4656: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4657: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4658: CsrMatrix *CcsrT = new CsrMatrix;
4659: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4660: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4662: (*C)->form_explicit_transpose = PETSC_TRUE;
4663: (*C)->transupdated = PETSC_TRUE;
4664: Ccusp->rowoffsets_gpu = NULL;
4665: CmatT->cprowIndices = NULL;
4666: CmatT->mat = CcsrT;
4667: CcsrT->num_rows = n;
4668: CcsrT->num_cols = m;
4669: CcsrT->num_entries = c->nz;
4671: CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1);
4672: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4673: CcsrT->values = new THRUSTARRAY(c->nz);
4675: PetscLogGpuTimeBegin();
4676: auto rT = CcsrT->row_offsets->begin();
4677: if (AT) {
4678: rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4679: thrust::advance(rT, -1);
4680: }
4681: if (BT) {
4682: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4683: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4684: thrust::copy(titb, tite, rT);
4685: }
4686: auto cT = CcsrT->column_indices->begin();
4687: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4688: if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4689: auto vT = CcsrT->values->begin();
4690: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4691: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4692: PetscLogGpuTimeEnd();
4694: cusparseCreateMatDescr(&CmatT->descr);
4695: cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);
4696: cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);
4697: cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar));
4698: cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar));
4699: cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));
4700: cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4701: cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4702: cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice);
4703: #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4704: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
4705: PetscCallCUSPARSE(stat);
4706: #endif
4707: Ccusp->matTranspose = CmatT;
4708: }
4709: }
4711: c->singlemalloc = PETSC_FALSE;
4712: c->free_a = PETSC_TRUE;
4713: c->free_ij = PETSC_TRUE;
4714: PetscMalloc1(m + 1, &c->i);
4715: PetscMalloc1(c->nz, &c->j);
4716: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4717: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4718: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4719: ii = *Ccsr->row_offsets;
4720: jj = *Ccsr->column_indices;
4721: cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4722: cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4723: } else {
4724: cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4725: cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost);
4726: }
4727: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt));
4728: PetscMalloc1(m, &c->ilen);
4729: PetscMalloc1(m, &c->imax);
4730: c->maxnz = c->nz;
4731: c->nonzerorowcnt = 0;
4732: c->rmax = 0;
4733: for (i = 0; i < m; i++) {
4734: const PetscInt nn = c->i[i + 1] - c->i[i];
4735: c->ilen[i] = c->imax[i] = nn;
4736: c->nonzerorowcnt += (PetscInt) !!nn;
4737: c->rmax = PetscMax(c->rmax, nn);
4738: }
4739: MatMarkDiagonal_SeqAIJ(*C);
4740: PetscMalloc1(c->nz, &c->a);
4741: (*C)->nonzerostate++;
4742: PetscLayoutSetUp((*C)->rmap);
4743: PetscLayoutSetUp((*C)->cmap);
4744: Ccusp->nonzerostate = (*C)->nonzerostate;
4745: (*C)->preallocated = PETSC_TRUE;
4746: } else {
4748: c = (Mat_SeqAIJ *)(*C)->data;
4749: if (c->nz) {
4750: Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4754: MatSeqAIJCUSPARSECopyToGPU(A);
4755: MatSeqAIJCUSPARSECopyToGPU(B);
4758: Acsr = (CsrMatrix *)Acusp->mat->mat;
4759: Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4760: Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4766: auto pmid = Ccusp->cooPerm->begin();
4767: thrust::advance(pmid, Acsr->num_entries);
4768: PetscLogGpuTimeBegin();
4769: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
4770: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4771: thrust::for_each(zibait, zieait, VecCUDAEquals());
4772: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4773: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4774: thrust::for_each(zibbit, ziebit, VecCUDAEquals());
4775: MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE);
4776: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4778: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4779: CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4780: CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4781: CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4782: auto vT = CcsrT->values->begin();
4783: if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4784: if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
4785: (*C)->transupdated = PETSC_TRUE;
4786: }
4787: PetscLogGpuTimeEnd();
4788: }
4789: }
4790: PetscObjectStateIncrease((PetscObject)*C);
4791: (*C)->assembled = PETSC_TRUE;
4792: (*C)->was_assembled = PETSC_FALSE;
4793: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4794: return 0;
4795: }
4797: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4798: {
4799: bool dmem;
4800: const PetscScalar *av;
4802: dmem = isCudaMem(v);
4803: MatSeqAIJCUSPARSEGetArrayRead(A, &av);
4804: if (n && idx) {
4805: THRUSTINTARRAY widx(n);
4806: widx.assign(idx, idx + n);
4807: PetscLogCpuToGpu(n * sizeof(PetscInt));
4809: THRUSTARRAY *w = NULL;
4810: thrust::device_ptr<PetscScalar> dv;
4811: if (dmem) {
4812: dv = thrust::device_pointer_cast(v);
4813: } else {
4814: w = new THRUSTARRAY(n);
4815: dv = w->data();
4816: }
4817: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4819: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4820: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4821: thrust::for_each(zibit, zieit, VecCUDAEquals());
4822: if (w) cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
4823: delete w;
4824: } else {
4825: cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
4826: }
4827: if (!dmem) PetscLogCpuToGpu(n * sizeof(PetscScalar));
4828: MatSeqAIJCUSPARSERestoreArrayRead(A, &av);
4829: return 0;
4830: }