Actual source code: mpiaijcusparse.cu

  1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  3: #include <petscconf.h>
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
  7: #include <../src/mat/impls/aij/mpi/cupm/mpiaijcupm.hpp>
  8: #include <thrust/advance.h>
  9: #include <thrust/partition.h>
 10: #include <thrust/sort.h>
 11: #include <thrust/unique.h>

 13: struct MatMPIAIJCUSPARSE_Policy {
 14:   typedef Mat_MPIAIJCUSPARSE mat_struct_type;

 16:   static const char *mpi_mat_type;
 17:   static const char *seq_mat_type;
 18:   static const char *vec_seq_type;
 19:   static const char *set_format_c;
 20:   static const char *mpi_convert_hypre_c;

 22:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
 23:   static PetscErrorCode MergeMats(Mat A, Mat B, MatReuse r, Mat *C) { return MatSeqAIJCUSPARSEMergeMats(A, B, r, C); }

 25:   static PetscErrorCode GetArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArray(A, a); }
 26:   static PetscErrorCode GetArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArrayWrite(A, a); }
 27:   static PetscErrorCode RestoreArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArray(A, a); }
 28:   static PetscErrorCode RestoreArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArrayWrite(A, a); }

 30:   static PetscErrorCode SetSubMatFormats(Mat A, Mat B, Mat_MPIAIJCUSPARSE *spptr)
 31:   {
 32:     PetscFunctionBegin;
 33:     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, spptr->diagGPUMatFormat));
 34:     PetscCall(MatCUSPARSESetFormat(B, MAT_CUSPARSE_MULT, spptr->offdiagGPUMatFormat));
 35:     PetscFunctionReturn(PETSC_SUCCESS);
 36:   }
 37: };
 38: const char *MatMPIAIJCUSPARSE_Policy::mpi_mat_type        = MATMPIAIJCUSPARSE;
 39: const char *MatMPIAIJCUSPARSE_Policy::seq_mat_type        = MATSEQAIJCUSPARSE;
 40: const char *MatMPIAIJCUSPARSE_Policy::vec_seq_type        = VECSEQCUDA;
 41: const char *MatMPIAIJCUSPARSE_Policy::set_format_c        = "MatCUSPARSESetFormat_C";
 42: const char *MatMPIAIJCUSPARSE_Policy::mpi_convert_hypre_c = "MatConvert_mpiaijcusparse_hypre_C";

 44: using MatMPIAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatMPIAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatMPIAIJCUSPARSE_Policy>;

 46: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
 47: {
 48:   return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j);
 49: }

 51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
 52: {
 53:   return MatMPIAIJCUSPARSE_CUPM_t::SetValuesCOO(mat, v, imode);
 54: }

 56: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
 57: {
 58:   return MatMPIAIJCUSPARSE_CUPM_t::GetLocalMatMerge(A, scall, glob, A_loc);
 59: }

 61: static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
 62: {
 63:   return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocation(B, d_nz, d_nnz, o_nz, o_nnz);
 64: }

 66: static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
 67: {
 68:   return MatMPIAIJCUSPARSE_CUPM_t::Mult(A, xx, yy);
 69: }

 71: static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
 72: {
 73:   return MatMPIAIJCUSPARSE_CUPM_t::ZeroEntries(A);
 74: }

 76: static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
 77: {
 78:   return MatMPIAIJCUSPARSE_CUPM_t::MultAdd(A, xx, yy, zz);
 79: }

 81: static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
 82: {
 83:   return MatMPIAIJCUSPARSE_CUPM_t::MultTranspose(A, xx, yy);
 84: }

 86: static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
 87: {
 88:   return MatMPIAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode);
 89: }

 91: static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 92: {
 93:   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
 94:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

 96:   PetscFunctionBegin;
 97:   switch (op) {
 98:   case MAT_CUSPARSE_MULT_DIAG:
 99:     cusparseStruct->diagGPUMatFormat = format;
100:     break;
101:   case MAT_CUSPARSE_MULT_OFFDIAG:
102:     cusparseStruct->offdiagGPUMatFormat = format;
103:     break;
104:   case MAT_CUSPARSE_ALL:
105:     cusparseStruct->diagGPUMatFormat    = format;
106:     cusparseStruct->offdiagGPUMatFormat = format;
107:     break;
108:   default:
109:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
110:   }
111:   PetscFunctionReturn(PETSC_SUCCESS);
112: }

114: static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
115: {
116:   MatCUSPARSEStorageFormat format;
117:   PetscBool                flg;
118:   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
119:   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

121:   PetscFunctionBegin;
122:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
123:   if (A->factortype == MAT_FACTOR_NONE) {
124:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
125:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
126:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
127:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
128:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
129:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
130:   }
131:   PetscOptionsHeadEnd();
132:   PetscFunctionReturn(PETSC_SUCCESS);
133: }

135: static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
136: {
137:   return MatMPIAIJCUSPARSE_CUPM_t::Destroy(A);
138: }

140: /* defines MatSetValues_MPICUSPARSE_Hash() */
141: #define TYPE AIJ
142: #define TYPE_AIJ
143: #define SUB_TYPE_CUSPARSE
144: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
145: #undef TYPE
146: #undef TYPE_AIJ
147: #undef SUB_TYPE_CUSPARSE

149: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
150: {
151:   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
152:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

154:   PetscFunctionBegin;
155:   PetscCall(MatSetUp_MPI_Hash(A));
156:   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
157:   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
158:   A->preallocated = PETSC_TRUE;
159:   PetscFunctionReturn(PETSC_SUCCESS);
160: }

162: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
163: {
164:   Mat_MPIAIJ *a;
165:   Mat         A;

167:   PetscFunctionBegin;
168:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
169:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
170:   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
171:   A             = *newmat;
172:   A->boundtocpu = PETSC_FALSE;
173:   PetscCall(PetscFree(A->defaultvectype));
174:   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

176:   a = (Mat_MPIAIJ *)A->data;
177:   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
178:   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
179:   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

181:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

183:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
184:   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
185:   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
186:   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
187:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
188:   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
189:   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
190:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
191:   A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
192:   A->ops->getcurrentmemtype     = MatGetCurrentMemType_MPIAIJ;

194:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
195:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
196:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
197:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
198:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
199:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
200: #if defined(PETSC_HAVE_HYPRE)
201:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
202: #endif
203:   PetscFunctionReturn(PETSC_SUCCESS);
204: }

206: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
207: {
208:   PetscFunctionBegin;
209:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
210:   PetscCall(MatCreate_MPIAIJ(A));
211:   PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
212:   PetscFunctionReturn(PETSC_SUCCESS);
213: }

215: /*@
216:   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
217:   to NVIDIA GPUs and use the CuSPARSE library for calculations.

219:   Collective

221:   Input Parameters:
222: + comm  - MPI communicator, set to `PETSC_COMM_SELF`
223: . m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
224:           This value should be the same as the local size used in creating the
225:           $y$ vector for the matrix-vector product $y = Ax$.
226: . n     - This value should be the same as the local size used in creating the
227:           $x$ vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
228:           calculated if `N` is given) For square matrices `n` is almost always `m`.
229: . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
230: . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
231: . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
232:           (same value is used for all local rows)
233: . d_nnz - array containing the number of nonzeros in the various rows of the
234:           DIAGONAL portion of the local submatrix (possibly different for each row)
235:           or `NULL`, if `d_nz` is used to specify the nonzero structure.
236:           The size of this array is equal to the number of local rows, i.e `m`.
237:           For matrices you plan to factor you must leave room for the diagonal entry and
238:           put in the entry even if it is zero.
239: . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
240:           submatrix (same value is used for all local rows).
241: - o_nnz - array containing the number of nonzeros in the various rows of the
242:           OFF-DIAGONAL portion of the local submatrix (possibly different for
243:           each row) or `NULL`, if `o_nz` is used to specify the nonzero
244:           structure. The size of this array is equal to the number
245:           of local rows, i.e `m`.

247:   Output Parameter:
248: . A - the matrix

250:   Level: intermediate

252:   Notes:
253:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
254:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
255:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

257:   The AIJ format, also called the
258:   compressed row storage), is fully compatible with standard Fortran
259:   storage.  That is, the stored row and column indices can begin at
260:   either one (as in Fortran) or zero.

262:   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

264: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
265: @*/
266: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
267: {
268:   return MatMPIAIJCUSPARSE_CUPM_t::CreateAIJ(comm, m, n, M, N, d_nz, d_nnz, o_nz, o_nnz, A);
269: }

271: /*MC
272:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.

274:    Options Database Keys:
275: +  -mat_type mpiaijcusparse                      - sets the matrix type to `MATMPIAIJCUSPARSE`
276: .  -mat_cusparse_storage_format csr              - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
277: .  -mat_cusparse_mult_diag_storage_format csr    - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
278: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

280:   Level: beginner

282:   Notes:
283:   These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.

285:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

287:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
288:   if some integer values passed in do not fit in `int`.

290: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
291: M*/

293: /*MC
294:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is as same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.

296:   Level: beginner

298: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
299: M*/