Actual source code: mpiaijcusparse.cu

  1: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  3: #include <petscconf.h>
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  6: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
  7: #include <../src/mat/impls/aij/mpi/cupm/mpiaijcupm.hpp>
  8: #include <thrust/advance.h>
  9: #include <thrust/partition.h>
 10: #include <thrust/sort.h>
 11: #include <thrust/tuple.h>
 12: #include <thrust/unique.h>

 14: struct MatMPIAIJCUSPARSE_Policy {
 15:   typedef Mat_MPIAIJCUSPARSE mat_struct_type;

 17:   static const char *mpi_mat_type;
 18:   static const char *seq_mat_type;
 19:   static const char *vec_seq_type;
 20:   static const char *set_format_c;
 21:   static const char *mpi_convert_hypre_c;

 23:   static PetscErrorCode CopyToGPU(Mat A) { return MatSeqAIJCUSPARSECopyToGPU(A); }
 24:   static PetscErrorCode MergeMats(Mat A, Mat B, MatReuse r, Mat *C) { return MatSeqAIJCUSPARSEMergeMats(A, B, r, C); }

 26:   static PetscErrorCode GetArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArray(A, a); }
 27:   static PetscErrorCode GetArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSEGetArrayWrite(A, a); }
 28:   static PetscErrorCode RestoreArray(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArray(A, a); }
 29:   static PetscErrorCode RestoreArrayWrite(Mat A, PetscScalar **a) { return MatSeqAIJCUSPARSERestoreArrayWrite(A, a); }

 31:   static PetscErrorCode SetSubMatFormats(Mat A, Mat B, Mat_MPIAIJCUSPARSE *spptr)
 32:   {
 33:     PetscFunctionBegin;
 34:     PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, spptr->diagGPUMatFormat));
 35:     PetscCall(MatCUSPARSESetFormat(B, MAT_CUSPARSE_MULT, spptr->offdiagGPUMatFormat));
 36:     PetscFunctionReturn(PETSC_SUCCESS);
 37:   }
 38: };
 39: const char *MatMPIAIJCUSPARSE_Policy::mpi_mat_type        = MATMPIAIJCUSPARSE;
 40: const char *MatMPIAIJCUSPARSE_Policy::seq_mat_type        = MATSEQAIJCUSPARSE;
 41: const char *MatMPIAIJCUSPARSE_Policy::vec_seq_type        = VECSEQCUDA;
 42: const char *MatMPIAIJCUSPARSE_Policy::set_format_c        = "MatCUSPARSESetFormat_C";
 43: const char *MatMPIAIJCUSPARSE_Policy::mpi_convert_hypre_c = "MatConvert_mpiaijcusparse_hypre_C";

 45: using MatMPIAIJCUSPARSE_CUPM_t = Petsc::mat::aij::cupm::impl::MatMPIAIJCUSPARSE_CUPM<Petsc::device::cupm::DeviceType::CUDA, MatMPIAIJCUSPARSE_Policy>;

 47: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
 48: {
 49:   return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocationCOO(mat, coo_n, coo_i, coo_j);
 50: }

 52: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
 53: {
 54:   return MatMPIAIJCUSPARSE_CUPM_t::SetValuesCOO(mat, v, imode);
 55: }

 57: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
 58: {
 59:   return MatMPIAIJCUSPARSE_CUPM_t::GetLocalMatMerge(A, scall, glob, A_loc);
 60: }

 62: static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
 63: {
 64:   return MatMPIAIJCUSPARSE_CUPM_t::SetPreallocation(B, d_nz, d_nnz, o_nz, o_nnz);
 65: }

 67: static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
 68: {
 69:   return MatMPIAIJCUSPARSE_CUPM_t::Mult(A, xx, yy);
 70: }

 72: static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
 73: {
 74:   return MatMPIAIJCUSPARSE_CUPM_t::ZeroEntries(A);
 75: }

 77: static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
 78: {
 79:   return MatMPIAIJCUSPARSE_CUPM_t::MultAdd(A, xx, yy, zz);
 80: }

 82: static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
 83: {
 84:   return MatMPIAIJCUSPARSE_CUPM_t::MultTranspose(A, xx, yy);
 85: }

 87: static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
 88: {
 89:   return MatMPIAIJCUSPARSE_CUPM_t::AssemblyEnd(A, mode);
 90: }

 92: static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
 93: {
 94:   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
 95:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

 97:   PetscFunctionBegin;
 98:   switch (op) {
 99:   case MAT_CUSPARSE_MULT_DIAG:
100:     cusparseStruct->diagGPUMatFormat = format;
101:     break;
102:   case MAT_CUSPARSE_MULT_OFFDIAG:
103:     cusparseStruct->offdiagGPUMatFormat = format;
104:     break;
105:   case MAT_CUSPARSE_ALL:
106:     cusparseStruct->diagGPUMatFormat    = format;
107:     cusparseStruct->offdiagGPUMatFormat = format;
108:     break;
109:   default:
110:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
111:   }
112:   PetscFunctionReturn(PETSC_SUCCESS);
113: }

115: static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
116: {
117:   MatCUSPARSEStorageFormat format;
118:   PetscBool                flg;
119:   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
120:   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

122:   PetscFunctionBegin;
123:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
124:   if (A->factortype == MAT_FACTOR_NONE) {
125:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
126:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
127:     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
128:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
129:     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
130:     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
131:   }
132:   PetscOptionsHeadEnd();
133:   PetscFunctionReturn(PETSC_SUCCESS);
134: }

136: static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
137: {
138:   return MatMPIAIJCUSPARSE_CUPM_t::Destroy(A);
139: }

141: /* defines MatSetValues_MPICUSPARSE_Hash() */
142: #define TYPE AIJ
143: #define TYPE_AIJ
144: #define SUB_TYPE_CUSPARSE
145: #include "../src/mat/impls/aij/mpi/mpihashmat.h"
146: #undef TYPE
147: #undef TYPE_AIJ
148: #undef SUB_TYPE_CUSPARSE

150: static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
151: {
152:   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
153:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

155:   PetscFunctionBegin;
156:   PetscCall(MatSetUp_MPI_Hash(A));
157:   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
158:   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
159:   A->preallocated = PETSC_TRUE;
160:   PetscFunctionReturn(PETSC_SUCCESS);
161: }

163: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
164: {
165:   Mat_MPIAIJ *a;
166:   Mat         A;

168:   PetscFunctionBegin;
169:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
170:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
171:   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
172:   A             = *newmat;
173:   A->boundtocpu = PETSC_FALSE;
174:   PetscCall(PetscFree(A->defaultvectype));
175:   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

177:   a = (Mat_MPIAIJ *)A->data;
178:   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
179:   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
180:   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

182:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

184:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
185:   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
186:   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
187:   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
188:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
189:   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
190:   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
191:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
192:   A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
193:   A->ops->getcurrentmemtype     = MatGetCurrentMemType_MPIAIJ;

195:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
196:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
197:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
198:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
199:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
200:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
201: #if defined(PETSC_HAVE_HYPRE)
202:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
203: #endif
204:   PetscFunctionReturn(PETSC_SUCCESS);
205: }

207: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
208: {
209:   PetscFunctionBegin;
210:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
211:   PetscCall(MatCreate_MPIAIJ(A));
212:   PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
213:   PetscFunctionReturn(PETSC_SUCCESS);
214: }

216: /*@
217:   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
218:   to NVIDIA GPUs and use the CuSPARSE library for calculations.

220:   Collective

222:   Input Parameters:
223: + comm  - MPI communicator, set to `PETSC_COMM_SELF`
224: . m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
225:           This value should be the same as the local size used in creating the
226:           $y$ vector for the matrix-vector product $y = Ax$.
227: . n     - This value should be the same as the local size used in creating the
228:           $x$ vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
229:           calculated if `N` is given) For square matrices `n` is almost always `m`.
230: . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
231: . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
232: . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
233:           (same value is used for all local rows)
234: . d_nnz - array containing the number of nonzeros in the various rows of the
235:           DIAGONAL portion of the local submatrix (possibly different for each row)
236:           or `NULL`, if `d_nz` is used to specify the nonzero structure.
237:           The size of this array is equal to the number of local rows, i.e `m`.
238:           For matrices you plan to factor you must leave room for the diagonal entry and
239:           put in the entry even if it is zero.
240: . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
241:           submatrix (same value is used for all local rows).
242: - o_nnz - array containing the number of nonzeros in the various rows of the
243:           OFF-DIAGONAL portion of the local submatrix (possibly different for
244:           each row) or `NULL`, if `o_nz` is used to specify the nonzero
245:           structure. The size of this array is equal to the number
246:           of local rows, i.e `m`.

248:   Output Parameter:
249: . A - the matrix

251:   Level: intermediate

253:   Notes:
254:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
255:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
256:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

258:   The AIJ format, also called the
259:   compressed row storage), is fully compatible with standard Fortran
260:   storage.  That is, the stored row and column indices can begin at
261:   either one (as in Fortran) or zero.

263:   When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

265: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
266: @*/
267: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
268: {
269:   return MatMPIAIJCUSPARSE_CUPM_t::CreateAIJ(comm, m, n, M, N, d_nz, d_nnz, o_nz, o_nnz, A);
270: }

272: /*MC
273:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.

275:    Options Database Keys:
276: +  -mat_type mpiaijcusparse                      - sets the matrix type to `MATMPIAIJCUSPARSE`
277: .  -mat_cusparse_storage_format csr              - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
278: .  -mat_cusparse_mult_diag_storage_format csr    - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
279: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

281:   Level: beginner

283:   Notes:
284:   These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.

286:   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

288:   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
289:   if some integer values passed in do not fit in `int`.

291: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
292: M*/

294: /*MC
295:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is as same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.

297:   Level: beginner

299: .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
300: M*/