Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
3: #include <petscconf.h>
4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7: #include <../src/mat/impls/aij/mpi/cupm/mpiaijcupm.hpp>
8: #include <thrust/advance.h>
9: #include <thrust/partition.h>
10: #include <thrust/sort.h>
11: #include <thrust/tuple.h>
12: #include <thrust/unique.h>
14: struct MatMPIAIJCUSPARSE_Policy {
15: typedef Mat_MPIAIJCUSPARSE mat_struct_type;
17: static const char *mpi_mat_type;
18: static const char *seq_mat_type;
19: static const char *vec_seq_type;
20: static const char *set_format_c;
21: static const char *mpi_convert_hypre_c;
23: static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
24: static PetscErrorCode MergeMats(Mat A, Mat B, MatReuse r, Mat *C) { return MatSeqAIJCUSPARSEMergeMats(A, B, r, C); }
26: static PetscErrorCode GetArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArray(A, a); }
27: static PetscErrorCode GetArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArrayWrite(A, a); }
28: static PetscErrorCode RestoreArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArray(A, a); }
29: static PetscErrorCode RestoreArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArrayWrite(A, a); }
31: static PetscErrorCode SetSubMatFormats(Mat A, Mat B, Mat_MPIAIJCUSPARSE *spptr)
32: {
33: PetscFunctionBegin;
34: PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, spptr->diagGPUMatFormat));
35: PetscCall(MatCUSPARSESetFormat(B, MAT_CUSPARSE_MULT, spptr->offdiagGPUMatFormat));
36: PetscFunctionReturn(PETSC_SUCCESS);
37: }
38: };
39: const char *MatMPIAIJCUSPARSE_Policy::mpi_mat_type = MATMPIAIJCUSPARSE;
40: const char *MatMPIAIJCUSPARSE_Policy::seq_mat_type = MATSEQAIJCUSPARSE;
41: const char *MatMPIAIJCUSPARSE_Policy::vec_seq_type = VECSEQCUDA;
42: const char *MatMPIAIJCUSPARSE_Policy::set_format_c = "MatCUSPARSESetFormat_C";
43: const char *MatMPIAIJCUSPARSE_Policy::mpi_convert_hypre_c = "MatConvert_mpiaijcusparse_hypre_C";
45: using MatMPIAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatMPIAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatMPIAIJCUSPARSE_Policy>;
47: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
48: {
49: return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j);
50: }
52: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
53: {
54: return MatMPIAIJCUSPARSE_CUPM_t::SetValuesCOO(mat, v, imode);
55: }
57: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
58: {
59: return MatMPIAIJCUSPARSE_CUPM_t::GetLocalMatMerge(A, scall, glob, A_loc);
60: }
62: static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
63: {
64: return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocation(B, d_nz, d_nnz, o_nz, o_nnz);
65: }
67: static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
68: {
69: return MatMPIAIJCUSPARSE_CUPM_t::Mult(A, xx, yy);
70: }
72: static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
73: {
74: return MatMPIAIJCUSPARSE_CUPM_t::ZeroEntries(A);
75: }
77: static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
78: {
79: return MatMPIAIJCUSPARSE_CUPM_t::MultAdd(A, xx, yy, zz);
80: }
82: static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
83: {
84: return MatMPIAIJCUSPARSE_CUPM_t::MultTranspose(A, xx, yy);
85: }
87: static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
88: {
89: return MatMPIAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode);
90: }
92: static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
93: {
94: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
95: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
97: PetscFunctionBegin;
98: switch (op) {
99: case MAT_CUSPARSE_MULT_DIAG:
100: cusparseStruct->diagGPUMatFormat = format;
101: break;
102: case MAT_CUSPARSE_MULT_OFFDIAG:
103: cusparseStruct->offdiagGPUMatFormat = format;
104: break;
105: case MAT_CUSPARSE_ALL:
106: cusparseStruct->diagGPUMatFormat = format;
107: cusparseStruct->offdiagGPUMatFormat = format;
108: break;
109: default:
110: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
111: }
112: PetscFunctionReturn(PETSC_SUCCESS);
113: }
115: static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
116: {
117: MatCUSPARSEStorageFormat format;
118: PetscBool flg;
119: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
120: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
122: PetscFunctionBegin;
123: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
124: if (A->factortype == MAT_FACTOR_NONE) {
125: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
126: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
127: PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
128: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
129: PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
130: if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
131: }
132: PetscOptionsHeadEnd();
133: PetscFunctionReturn(PETSC_SUCCESS);
134: }
136: static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
137: {
138: return MatMPIAIJCUSPARSE_CUPM_t::Destroy(A);
139: }
141: /* defines MatSetValues_MPICUSPARSE_Hash() */
142: #define TYPE AIJ
143: #define TYPE_AIJ
144: #define SUB_TYPE_CUSPARSE
145: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
146: #undef TYPE
147: #undef TYPE_AIJ
148: #undef SUB_TYPE_CUSPARSE
150: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
151: {
152: Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data;
153: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
155: PetscFunctionBegin;
156: PetscCall(MatSetUp_MPI_Hash(A));
157: PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
158: PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
159: A->preallocated = PETSC_TRUE;
160: PetscFunctionReturn(PETSC_SUCCESS);
161: }
163: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
164: {
165: Mat_MPIAIJ *a;
166: Mat A;
168: PetscFunctionBegin;
169: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
170: if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
171: else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
172: A = *newmat;
173: A->boundtocpu = PETSC_FALSE;
174: PetscCall(PetscFree(A->defaultvectype));
175: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
177: a = (Mat_MPIAIJ *)A->data;
178: if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
179: if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
180: if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
182: if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
184: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
185: A->ops->mult = MatMult_MPIAIJCUSPARSE;
186: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
187: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
188: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
189: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
190: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
191: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
192: A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE;
193: A->ops->getcurrentmemtype = MatGetCurrentMemType_MPIAIJ;
195: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
196: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
197: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
198: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
199: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
200: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
201: #if defined(PETSC_HAVE_HYPRE)
202: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
203: #endif
204: PetscFunctionReturn(PETSC_SUCCESS);
205: }
207: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
208: {
209: PetscFunctionBegin;
210: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
211: PetscCall(MatCreate_MPIAIJ(A));
212: PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
213: PetscFunctionReturn(PETSC_SUCCESS);
214: }
216: /*@
217: MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
218: to NVIDIA GPUs and use the CuSPARSE library for calculations.
220: Collective
222: Input Parameters:
223: + comm - MPI communicator, set to `PETSC_COMM_SELF`
224: . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
225: This value should be the same as the local size used in creating the
226: $y$ vector for the matrix-vector product $y = Ax$.
227: . n - This value should be the same as the local size used in creating the
228: $x$ vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
229: calculated if `N` is given) For square matrices `n` is almost always `m`.
230: . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
231: . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
232: . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix
233: (same value is used for all local rows)
234: . d_nnz - array containing the number of nonzeros in the various rows of the
235: DIAGONAL portion of the local submatrix (possibly different for each row)
236: or `NULL`, if `d_nz` is used to specify the nonzero structure.
237: The size of this array is equal to the number of local rows, i.e `m`.
238: For matrices you plan to factor you must leave room for the diagonal entry and
239: put in the entry even if it is zero.
240: . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
241: submatrix (same value is used for all local rows).
242: - o_nnz - array containing the number of nonzeros in the various rows of the
243: OFF-DIAGONAL portion of the local submatrix (possibly different for
244: each row) or `NULL`, if `o_nz` is used to specify the nonzero
245: structure. The size of this array is equal to the number
246: of local rows, i.e `m`.
248: Output Parameter:
249: . A - the matrix
251: Level: intermediate
253: Notes:
254: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
255: MatXXXXSetPreallocation() paradigm instead of this routine directly.
256: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
258: The AIJ format, also called the
259: compressed row storage), is fully compatible with standard Fortran
260: storage. That is, the stored row and column indices can begin at
261: either one (as in Fortran) or zero.
263: When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
265: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
266: @*/
267: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
268: {
269: return MatMPIAIJCUSPARSE_CUPM_t::CreateAIJ(comm, m, n, M, N, d_nz, d_nnz, o_nz, o_nnz, A);
270: }
272: /*MC
273: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.
275: Options Database Keys:
276: + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
277: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
278: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
279: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
281: Level: beginner
283: Notes:
284: These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.
286: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
288: Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
289: if some integer values passed in do not fit in `int`.
291: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
292: M*/
294: /*MC
295: MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is as same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.
297: Level: beginner
299: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
300: M*/