// aijkok.kokkos.cxx
1: #include <petsc_kokkos.hpp>
2: #include <petscvec_kokkos.hpp>
3: #include <petscmat_kokkos.hpp>
4: #include <petscpkg_version.h>
5: #include <petsc/private/petscimpl.h>
6: #include <petsc/private/sfimpl.h>
7: #include <petsc/private/kokkosimpl.hpp>
8: #include <petscsys.h>
10: #include <KokkosBlas.hpp>
11: #include <KokkosSparse_CrsMatrix.hpp>
13: // To suppress compiler warnings:
14: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
15: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
16: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
17: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
18: #define DISABLE_CUSPARSE_DEPRECATED
19: #include <KokkosSparse_spmv.hpp>
21: #include <KokkosSparse_spiluk.hpp>
22: #include <KokkosSparse_sptrsv.hpp>
23: #include <KokkosSparse_spgemm.hpp>
24: #include <KokkosSparse_spadd.hpp>
25: #include <KokkosBatched_LU_Decl.hpp>
26: #include <KokkosBatched_InverseLU_Decl.hpp>
28: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
31: #include <KokkosSparse_Utils.hpp>
32: using KokkosSparse::sort_crs_matrix;
33: using KokkosSparse::Impl::transpose_matrix;
34: #else
35: #include <KokkosKernels_Sorting.hpp>
36: using KokkosKernels::sort_crs_matrix;
37: using KokkosKernels::Impl::transpose_matrix;
38: #endif
40: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
41: using KokkosSparse::spiluk_symbolic;
42: using KokkosSparse::spiluk_numeric;
43: using KokkosSparse::sptrsv_symbolic;
44: using KokkosSparse::sptrsv_solve;
45: using KokkosSparse::Experimental::SPTRSVAlgorithm;
46: using KokkosSparse::Experimental::SPILUKAlgorithm;
47: #else
48: using KokkosSparse::Experimental::spiluk_symbolic;
49: using KokkosSparse::Experimental::spiluk_numeric;
50: using KokkosSparse::Experimental::sptrsv_symbolic;
51: using KokkosSparse::Experimental::sptrsv_solve;
52: using KokkosSparse::Experimental::SPTRSVAlgorithm;
53: using KokkosSparse::Experimental::SPILUKAlgorithm;
54: #endif
56: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
59: we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
60: In the latter case, it is important to set a_dual's sync state correctly.
61: */
62: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
63: {
64: Mat_SeqAIJ *aijseq;
65: Mat_SeqAIJKokkos *aijkok;
67: PetscFunctionBegin;
68: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
69: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
72: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74: /* If aijkok does not exist, we just copy i, j to device.
75: If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
76: In both cases, we build a new aijkok structure.
77: */
78: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
79: if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
80: PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
81: PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
82: PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
83: PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
84: PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
85: PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
86: aijseq->free_a = PETSC_TRUE;
87: aijseq->free_ij = PETSC_TRUE;
88: /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
89: we have the CsrMatrix on device first and then copy to host, followed by
90: MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
91: One could improve it by not using NULL garray.
92: */
93: }
94: delete aijkok;
95: aijkok = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
96: A->spptr = aijkok;
97: } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
98: const PetscInt *adiag;
99: /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
100: PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
101: MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
102: auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
103: aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
104: }
105: PetscFunctionReturn(PETSC_SUCCESS);
106: }
108: /* Sync CSR data to device if not yet */
109: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
110: {
111: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113: PetscFunctionBegin;
114: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
115: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
116: if (aijkok->a_dual.need_sync_device()) {
117: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
118: aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
119: aijkok->hermitian_updated = PETSC_FALSE;
120: }
121: PetscFunctionReturn(PETSC_SUCCESS);
122: }
124: /* Mark the CSR data on device as modified */
125: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
126: {
127: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129: PetscFunctionBegin;
130: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
131: aijkok->a_dual.clear_sync_state();
132: aijkok->a_dual.modify_device();
133: aijkok->transpose_updated = PETSC_FALSE;
134: aijkok->hermitian_updated = PETSC_FALSE;
135: PetscCall(PetscObjectStateIncrease((PetscObject)A));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
139: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140: {
141: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142: auto exec = PetscGetKokkosExecutionSpace();
144: PetscFunctionBegin;
145: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146: /* We do not expect one needs factors on host */
147: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
153: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154: {
155: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157: PetscFunctionBegin;
158: /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
159: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160: reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
161: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162: */
163: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165: *array = aijkok->a_dual.view_host().data();
166: } else { /* Happens when calling MatSetValues on a newly created matrix */
167: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168: }
169: PetscFunctionReturn(PETSC_SUCCESS);
170: }
172: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173: {
174: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
176: PetscFunctionBegin;
177: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178: PetscFunctionReturn(PETSC_SUCCESS);
179: }
181: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182: {
183: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185: PetscFunctionBegin;
186: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188: *array = aijkok->a_dual.view_host().data();
189: } else {
190: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191: }
192: PetscFunctionReturn(PETSC_SUCCESS);
193: }
195: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196: {
197: PetscFunctionBegin;
198: *array = NULL;
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203: {
204: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
206: PetscFunctionBegin;
207: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208: *array = aijkok->a_dual.view_host().data();
209: } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
210: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211: }
212: PetscFunctionReturn(PETSC_SUCCESS);
213: }
215: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216: {
217: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219: PetscFunctionBegin;
220: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221: aijkok->a_dual.clear_sync_state();
222: aijkok->a_dual.modify_host();
223: }
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228: {
229: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231: PetscFunctionBegin;
232: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
234: if (i) *i = aijkok->i_device_data();
235: if (j) *j = aijkok->j_device_data();
236: if (a) {
237: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238: *a = aijkok->a_device_data();
239: }
240: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241: PetscFunctionReturn(PETSC_SUCCESS);
242: }
244: static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
245: {
246: PetscFunctionBegin;
247: *mtype = PETSC_MEMTYPE_KOKKOS;
248: PetscFunctionReturn(PETSC_SUCCESS);
249: }
251: /*
252: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
254: Input Parameter:
255: . A - the MATSEQAIJKOKKOS matrix
257: Output Parameters:
258: + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
259: - T_d - the transpose on device, whose value array is allocated but not initialized
260: */
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; // NOTE(review): rmap->N (global) is used for m while cmap->n (local) for n; equal for a Seq matrix — confirm intended
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti: count nonzeros per column of A (= per row of T), then prefix-sum into CSR row offsets
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++; // temporarily shift the pointer so Ti[Aj[i]]++ accumulates the count of column j into slot j+1
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--; // shift back; Ti[1..n] now holds per-row counts of T, Ti[0] = 0
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; // inclusive prefix sum -> row offsets

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j];                       // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j; // so the caller can later fill Ta(disp) from Aa(j)
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device; T's value array "Ta" is allocated here but left uninitialized on purpose
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}
307: // Generate the transpose on device and cache it internally
308: // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; // NOTE(review): rmap->N (global) — equals rmap->n for a Seq matrix; confirm intended
  KokkosCsrMatrix  &T = akok->csrmatT;                             // NOTE(review): akok is dereferenced here, before the NULL PetscCheck below

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat; // A^T = A, so simply hand back A's own CSR
  } else {
    // See if we already have a cached transpose and its value is up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {           // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        // scatter A's values into the cached transpose: Ta(i) = Aa(perm(i))
        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT                = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
346: // Generate the Hermitian on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; // NOTE(review): rmap->N (global) — equals rmap->n for a Seq matrix; confirm intended
  KokkosCsrMatrix  &T = akok->csrmatH;                             // NOTE(review): akok is dereferenced here, before the NULL PetscCheck below

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat; // A^H = A, so simply hand back A's own CSR
  } else {
    // See if we already have a cached Hermitian and its value is up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {           // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // the same permutation as for the plain transpose applies
        auto       &Ta   = T.values;

        // scatter conjugated values: Ta(i) = conj(Aa(perm(i)))
        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH                = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
384: /* y = A x */
385: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
386: {
387: Mat_SeqAIJKokkos *aijkok;
388: ConstPetscScalarKokkosView xv;
389: PetscScalarKokkosView yv;
391: PetscFunctionBegin;
392: PetscCall(PetscLogGpuTimeBegin());
393: PetscCall(MatSeqAIJKokkosSyncDevice(A));
394: PetscCall(VecGetKokkosView(xx, &xv));
395: PetscCall(VecGetKokkosViewWrite(yy, &yv));
396: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
397: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
398: PetscCall(VecRestoreKokkosView(xx, &xv));
399: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
400: /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
401: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
402: PetscCall(PetscLogGpuTimeEnd());
403: PetscFunctionReturn(PETSC_SUCCESS);
404: }
406: /* y = A^T x */
407: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
408: {
409: Mat_SeqAIJKokkos *aijkok;
410: const char *mode;
411: ConstPetscScalarKokkosView xv;
412: PetscScalarKokkosView yv;
413: KokkosCsrMatrix csrmat;
415: PetscFunctionBegin;
416: PetscCall(PetscLogGpuTimeBegin());
417: PetscCall(MatSeqAIJKokkosSyncDevice(A));
418: PetscCall(VecGetKokkosView(xx, &xv));
419: PetscCall(VecGetKokkosViewWrite(yy, &yv));
420: if (A->form_explicit_transpose) {
421: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
422: mode = "N";
423: } else {
424: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
425: csrmat = aijkok->csrmat;
426: mode = "T";
427: }
428: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
429: PetscCall(VecRestoreKokkosView(xx, &xv));
430: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
431: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
432: PetscCall(PetscLogGpuTimeEnd());
433: PetscFunctionReturn(PETSC_SUCCESS);
434: }
436: /* y = A^H x */
437: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
438: {
439: Mat_SeqAIJKokkos *aijkok;
440: const char *mode;
441: ConstPetscScalarKokkosView xv;
442: PetscScalarKokkosView yv;
443: KokkosCsrMatrix csrmat;
445: PetscFunctionBegin;
446: PetscCall(PetscLogGpuTimeBegin());
447: PetscCall(MatSeqAIJKokkosSyncDevice(A));
448: PetscCall(VecGetKokkosView(xx, &xv));
449: PetscCall(VecGetKokkosViewWrite(yy, &yv));
450: if (A->form_explicit_transpose) {
451: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
452: mode = "N";
453: } else {
454: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
455: csrmat = aijkok->csrmat;
456: mode = "C";
457: }
458: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
459: PetscCall(VecRestoreKokkosView(xx, &xv));
460: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
461: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
462: PetscCall(PetscLogGpuTimeEnd());
463: PetscFunctionReturn(PETSC_SUCCESS);
464: }
466: /* z = A x + y */
467: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
468: {
469: Mat_SeqAIJKokkos *aijkok;
470: ConstPetscScalarKokkosView xv;
471: PetscScalarKokkosView zv;
473: PetscFunctionBegin;
474: PetscCall(PetscLogGpuTimeBegin());
475: PetscCall(MatSeqAIJKokkosSyncDevice(A));
476: if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
477: PetscCall(VecGetKokkosView(xx, &xv));
478: PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
479: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
480: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
481: PetscCall(VecRestoreKokkosView(xx, &xv));
482: PetscCall(VecRestoreKokkosView(zz, &zv));
483: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
484: PetscCall(PetscLogGpuTimeEnd());
485: PetscFunctionReturn(PETSC_SUCCESS);
486: }
488: /* z = A^T x + y */
489: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
490: {
491: Mat_SeqAIJKokkos *aijkok;
492: const char *mode;
493: ConstPetscScalarKokkosView xv;
494: PetscScalarKokkosView zv;
495: KokkosCsrMatrix csrmat;
497: PetscFunctionBegin;
498: PetscCall(PetscLogGpuTimeBegin());
499: PetscCall(MatSeqAIJKokkosSyncDevice(A));
500: if (zz != yy) PetscCall(VecCopy(yy, zz));
501: PetscCall(VecGetKokkosView(xx, &xv));
502: PetscCall(VecGetKokkosView(zz, &zv));
503: if (A->form_explicit_transpose) {
504: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
505: mode = "N";
506: } else {
507: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
508: csrmat = aijkok->csrmat;
509: mode = "T";
510: }
511: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
512: PetscCall(VecRestoreKokkosView(xx, &xv));
513: PetscCall(VecRestoreKokkosView(zz, &zv));
514: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
515: PetscCall(PetscLogGpuTimeEnd());
516: PetscFunctionReturn(PETSC_SUCCESS);
517: }
519: /* z = A^H x + y */
520: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
521: {
522: Mat_SeqAIJKokkos *aijkok;
523: const char *mode;
524: ConstPetscScalarKokkosView xv;
525: PetscScalarKokkosView zv;
526: KokkosCsrMatrix csrmat;
528: PetscFunctionBegin;
529: PetscCall(PetscLogGpuTimeBegin());
530: PetscCall(MatSeqAIJKokkosSyncDevice(A));
531: if (zz != yy) PetscCall(VecCopy(yy, zz));
532: PetscCall(VecGetKokkosView(xx, &xv));
533: PetscCall(VecGetKokkosView(zz, &zv));
534: if (A->form_explicit_transpose) {
535: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
536: mode = "N";
537: } else {
538: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
539: csrmat = aijkok->csrmat;
540: mode = "C";
541: }
542: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
543: PetscCall(VecRestoreKokkosView(xx, &xv));
544: PetscCall(VecRestoreKokkosView(zz, &zv));
545: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
546: PetscCall(PetscLogGpuTimeEnd());
547: PetscFunctionReturn(PETSC_SUCCESS);
548: }
550: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
551: {
552: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
554: PetscFunctionBegin;
555: switch (op) {
556: case MAT_FORM_EXPLICIT_TRANSPOSE:
557: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
558: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
559: A->form_explicit_transpose = flg;
560: break;
561: default:
562: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
563: break;
564: }
565: PetscFunctionReturn(PETSC_SUCCESS);
566: }
568: /* Depending on reuse, either build a new mat, or use the existing mat */
569: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
570: {
571: Mat_SeqAIJ *aseq;
573: PetscFunctionBegin;
574: PetscCall(PetscKokkosInitializeCheck());
575: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
576: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
577: PetscCall(MatSetType(*newmat, mtype));
578: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
579: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
580: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
581: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
582: PetscCall(PetscFree(A->defaultvectype));
583: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
584: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
585: PetscCall(MatSetOps_SeqAIJKokkos(A));
586: aseq = static_cast<Mat_SeqAIJ *>(A->data);
587: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
588: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
589: A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
590: }
591: }
592: PetscFunctionReturn(PETSC_SUCCESS);
593: }
595: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
596: an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
597: */
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      /* NOTE(review): akok is dereferenced only here; MAT_COPY_VALUES requires A assembled, in which case A->spptr is expected non-NULL — confirm */
      if (akok->a_dual.need_sync_device()) { /* host holds A's latest values: copy host side only */
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  /* Finish converting the duplicate to the Kokkos type: default vec type, type name, and ops table */
  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}
static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      /* B's sparsity pattern already matches (checked above); refresh only the device values */
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      /* copy values and column indices from the device transpose into B's host arrays */
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
666: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
667: {
668: Mat_SeqAIJKokkos *aijkok;
670: PetscFunctionBegin;
671: if (A->factortype == MAT_FACTOR_NONE) {
672: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
673: delete aijkok;
674: } else {
675: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
676: }
677: A->spptr = NULL;
678: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
679: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
680: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
681: #if defined(PETSC_HAVE_HYPRE)
682: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
683: #endif
684: PetscCall(MatDestroy_SeqAIJ(A));
685: PetscFunctionReturn(PETSC_SUCCESS);
686: }
688: /*MC
689: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
691: A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
693: Options Database Key:
694: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
696: Level: beginner
698: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
699: M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());                                        /* lazily initialize Kokkos before any Kokkos use */
  PetscCall(MatCreate_SeqAIJ(A));                                                 /* build a plain SeqAIJ first... */
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); /* ...then convert it to the Kokkos type in place */
  PetscFunctionReturn(PETSC_SUCCESS);
}
709: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
710: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
711: {
712: Mat_SeqAIJ *a, *b;
713: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
714: MatScalarKokkosView aa, ba, ca;
715: MatRowMapKokkosView ai, bi, ci;
716: MatColIdxKokkosView aj, bj, cj;
717: PetscInt m, n, nnz, aN;
719: PetscFunctionBegin;
722: PetscAssertPointer(C, 4);
723: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
724: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
725: PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
726: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
728: PetscCall(MatSeqAIJKokkosSyncDevice(A));
729: PetscCall(MatSeqAIJKokkosSyncDevice(B));
730: a = static_cast<Mat_SeqAIJ *>(A->data);
731: b = static_cast<Mat_SeqAIJ *>(B->data);
732: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
733: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
734: aa = akok->a_dual.view_device();
735: ai = akok->i_dual.view_device();
736: ba = bkok->a_dual.view_device();
737: bi = bkok->i_dual.view_device();
738: m = A->rmap->n; /* M, N and nnz of C */
739: n = A->cmap->n + B->cmap->n;
740: nnz = a->nz + b->nz;
741: aN = A->cmap->n; /* N of A */
742: if (reuse == MAT_INITIAL_MATRIX) {
743: aj = akok->j_dual.view_device();
744: bj = bkok->j_dual.view_device();
745: auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
746: auto ci = MatRowMapKokkosView("i", ai.extent(0));
747: auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
749: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
750: Kokkos::parallel_for(
751: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
752: PetscInt i = t.league_rank(); /* row i */
753: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
756: ci(i) = coffset;
757: if (i == m - 1) ci(m) = ai(m) + bi(m);
758: });
760: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
761: if (k < alen) {
762: ca(coffset + k) = aa(ai(i) + k);
763: cj(coffset + k) = aj(ai(i) + k);
764: } else {
765: ca(coffset + k) = ba(bi(i) + k - alen);
766: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
767: }
768: });
769: });
770: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
771: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
772: } else if (reuse == MAT_REUSE_MATRIX) {
774: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
775: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
776: ca = ckok->a_dual.view_device();
777: ci = ckok->i_dual.view_device();
779: Kokkos::parallel_for(
780: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
781: PetscInt i = t.league_rank(); /* row i */
782: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
783: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
784: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
785: else ca(ci(i) + k) = ba(bi(i) + k - alen);
786: });
787: });
788: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
789: }
790: PetscFunctionReturn(PETSC_SUCCESS);
791: }
/* Destroy callback for the spgemm product context attached to C->product->data */
static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(PetscCtxRt pdata)
{
  PetscFunctionBegin;
  /* pdata is the address of the stored pointer; delete the context it points to */
  delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Numeric phase of C = op(A)*op(B) for two MATSEQAIJKOKKOS matrices via KokkosSparse::spgemm_numeric().
   Supports AB, AtB and ABt; transposes are materialized explicitly since KK spgemm lacks transpose support. */
static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  Mat                         A, B;
  bool                        transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  Mat_SeqAIJ                 *c;
  MatProductCtx_SeqAIJKokkos *pdata;
  KokkosCsrMatrix             csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Map the product type to transpose flags for spgemm */
  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* operands must be current on device */
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C)); /* C's values now live on device */
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Symbolic phase of C = op(A)*op(B): creates the spgemm handle, computes C's sparsity pattern
   (and, as a workaround, its values too) and installs the device CSR into C. */
static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  MatProductType              ptype;
  Mat                         A, B;
  bool                        transA, transB;
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  MatProductCtx_SeqAIJKokkos *pdata;
  MPI_Comm                    comm;
  KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  /* Map the (possibly simplified) product type to transpose flags and set C's block sizes */
  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
  pdata->reusesym                    = product->api_user; /* let the following numeric phase be skipped once */

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos; /* so the handle/context is freed with the product */
  PetscFunctionReturn(PETSC_SUCCESS);
}
987: /* handles sparse matrix matrix ops */
988: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
989: {
990: Mat_Product *product = mat->product;
991: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
993: PetscFunctionBegin;
994: MatCheckProduct(mat, 1);
995: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
996: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
997: if (Biskok && Ciskok) {
998: switch (product->type) {
999: case MATPRODUCT_AB:
1000: case MATPRODUCT_AtB:
1001: case MATPRODUCT_ABt:
1002: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1003: break;
1004: case MATPRODUCT_PtAP:
1005: case MATPRODUCT_RARt:
1006: case MATPRODUCT_ABC:
1007: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1008: break;
1009: default:
1010: break;
1011: }
1012: } else { /* fallback for AIJ */
1013: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1014: }
1015: PetscFunctionReturn(PETSC_SUCCESS);
1016: }
1018: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1019: {
1020: Mat_SeqAIJKokkos *aijkok;
1022: PetscFunctionBegin;
1023: PetscCall(PetscLogGpuTimeBegin());
1024: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1025: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1026: KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1027: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1028: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1029: PetscCall(PetscLogGpuTimeEnd());
1030: PetscFunctionReturn(PETSC_SUCCESS);
1031: }
1033: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1034: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1035: {
1036: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1038: PetscFunctionBegin;
1039: if (A->assembled && aijseq->diagDense) { // no missing diagonals
1040: PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1042: PetscCall(PetscLogGpuTimeBegin());
1043: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1044: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1045: const auto &Aa = aijkok->a_dual.view_device();
1046: const auto &Adiag = aijkok->diag_dual.view_device();
1047: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1048: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1049: PetscCall(PetscLogGpuFlops(n));
1050: PetscCall(PetscLogGpuTimeEnd());
1051: } else { // need reassembly, very slow!
1052: PetscCall(MatShift_Basic(A, a));
1053: }
1054: PetscFunctionReturn(PETSC_SUCCESS);
1055: }
1057: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1058: {
1059: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1061: PetscFunctionBegin;
1062: if (Y->assembled && aijseq->diagDense) { // no missing diagonals
1063: ConstPetscScalarKokkosView dv;
1064: PetscInt n, nv;
1066: PetscCall(PetscLogGpuTimeBegin());
1067: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1068: PetscCall(VecGetKokkosView(D, &dv));
1069: PetscCall(VecGetLocalSize(D, &nv));
1070: n = PetscMin(Y->rmap->n, Y->cmap->n);
1071: PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1073: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1074: const auto &Aa = aijkok->a_dual.view_device();
1075: const auto &Adiag = aijkok->diag_dual.view_device();
1076: PetscCallCXX(Kokkos::parallel_for(
1077: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1078: if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1079: else Aa(Adiag(i)) += dv(i);
1080: }));
1081: PetscCall(VecRestoreKokkosView(D, &dv));
1082: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1083: PetscCall(PetscLogGpuFlops(n));
1084: PetscCall(PetscLogGpuTimeEnd());
1085: } else { // need reassembly, very slow!
1086: PetscCall(MatDiagonalSet_Default(Y, D, is));
1087: }
1088: PetscFunctionReturn(PETSC_SUCCESS);
1089: }
1091: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1092: {
1093: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1094: PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1095: ConstPetscScalarKokkosView lv, rv;
1097: PetscFunctionBegin;
1098: PetscCall(PetscLogGpuTimeBegin());
1099: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1100: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1101: const auto &Aa = aijkok->a_dual.view_device();
1102: const auto &Ai = aijkok->i_dual.view_device();
1103: const auto &Aj = aijkok->j_dual.view_device();
1104: if (ll) {
1105: PetscCall(VecGetLocalSize(ll, &m));
1106: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1107: PetscCall(VecGetKokkosView(ll, &lv));
1108: PetscCallCXX(Kokkos::parallel_for( // for each row
1109: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1110: PetscInt i = t.league_rank(); // row i
1111: PetscInt len = Ai(i + 1) - Ai(i);
1112: // scale entries on the row
1113: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1114: }));
1115: PetscCall(VecRestoreKokkosView(ll, &lv));
1116: PetscCall(PetscLogGpuFlops(nz));
1117: }
1118: if (rr) {
1119: PetscCall(VecGetLocalSize(rr, &n));
1120: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1121: PetscCall(VecGetKokkosView(rr, &rv));
1122: PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1123: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1124: PetscCall(VecRestoreKokkosView(rr, &lv));
1125: PetscCall(PetscLogGpuFlops(nz));
1126: }
1127: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1128: PetscCall(PetscLogGpuTimeEnd());
1129: PetscFunctionReturn(PETSC_SUCCESS);
1130: }
1132: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1133: {
1134: Mat_SeqAIJKokkos *aijkok;
1136: PetscFunctionBegin;
1137: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1138: if (aijkok) { /* Only zero the device if data is already there */
1139: KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1140: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1141: } else { /* Might be preallocated but not assembled */
1142: PetscCall(MatZeroEntries_SeqAIJ(A));
1143: }
1144: PetscFunctionReturn(PETSC_SUCCESS);
1145: }
1147: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1148: {
1149: Mat_SeqAIJKokkos *aijkok;
1150: PetscInt n;
1151: PetscScalarKokkosView xv;
1153: PetscFunctionBegin;
1154: PetscCall(VecGetLocalSize(x, &n));
1155: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1156: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1158: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1159: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1161: const auto &Aa = aijkok->a_dual.view_device();
1162: const auto &Ai = aijkok->i_dual.view_device();
1163: const auto &Adiag = aijkok->diag_dual.view_device();
1165: PetscCall(VecGetKokkosViewWrite(x, &xv));
1166: Kokkos::parallel_for(
1167: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1168: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1169: else xv(i) = 0;
1170: });
1171: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1172: PetscFunctionReturn(PETSC_SUCCESS);
1173: }
1175: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1176: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1177: {
1178: Mat_SeqAIJKokkos *aijkok;
1180: PetscFunctionBegin;
1182: PetscAssertPointer(kv, 2);
1183: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1184: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1185: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1186: *kv = aijkok->a_dual.view_device();
1187: PetscFunctionReturn(PETSC_SUCCESS);
1188: }
/* Restore the read-only view obtained with MatSeqAIJGetKokkosView(); no flags change since nothing was modified */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}
1199: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1200: {
1201: Mat_SeqAIJKokkos *aijkok;
1203: PetscFunctionBegin;
1205: PetscAssertPointer(kv, 2);
1206: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1207: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1208: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1209: *kv = aijkok->a_dual.view_device();
1210: PetscFunctionReturn(PETSC_SUCCESS);
1211: }
/* Restore the read-write view: mark the device copy as modified so later syncs copy back to host */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1223: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1224: {
1225: Mat_SeqAIJKokkos *aijkok;
1227: PetscFunctionBegin;
1229: PetscAssertPointer(kv, 2);
1230: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1231: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1232: *kv = aijkok->a_dual.view_device();
1233: PetscFunctionReturn(PETSC_SUCCESS);
1234: }
/* Restore the write-only view: mark the device copy as modified so later syncs copy back to host */
PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Create an m-by-n MATSEQAIJKOKKOS that wraps caller-supplied device CSR views (rowmap i_d, column
   indices j_d, values a_d) without copying them */
PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  /* nnz equals the length of the column-index view j_d */
  PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1257: /* Computes Y += alpha X */
1258: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1259: {
1260: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1261: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1262: ConstMatScalarKokkosView Xa;
1263: MatScalarKokkosView Ya;
1264: auto exec = PetscGetKokkosExecutionSpace();
1266: PetscFunctionBegin;
1267: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1268: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1269: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1270: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1271: PetscCall(PetscLogGpuTimeBegin());
1273: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1274: PetscBool e;
1275: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1276: if (e) {
1277: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1278: if (e) pattern = SAME_NONZERO_PATTERN;
1279: }
1280: }
1282: /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1283: cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1284: If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1285: KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1286: */
1287: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1288: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1289: Xa = xkok->a_dual.view_device();
1290: Ya = ykok->a_dual.view_device();
1292: if (pattern == SAME_NONZERO_PATTERN) {
1293: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1294: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1295: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1296: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1297: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1299: Kokkos::parallel_for(
1300: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1301: PetscInt i = t.league_rank(); // row i
1302: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1303: // Only one thread works in a team
1304: PetscInt p, q = Yi(i);
1305: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1306: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1307: if (Xj(p) == Yj(q)) { // Found it
1308: Ya(q) += alpha * Xa(p);
1309: q++;
1310: } else {
1311: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1312: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1313: #if PETSC_PKG_KOKKOS_VERSION_GE(5, 0, 0)
1314: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = KokkosKernels::ArithTraits<PetscScalar>::nan();
1315: #elif PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1316: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1317: #else
1318: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1319: #endif
1320: }
1321: }
1322: });
1323: });
1324: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1325: } else { // different nonzero patterns
1326: Mat Z;
1327: KokkosCsrMatrix zcsr;
1328: KernelHandle kh;
1329: kh.create_spadd_handle(true); // X, Y are sorted
1330: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1331: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1332: zkok = new Mat_SeqAIJKokkos(zcsr);
1333: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1334: PetscCall(MatHeaderReplace(Y, &Z));
1335: kh.destroy_spadd_handle();
1336: }
1337: PetscCall(PetscLogGpuTimeEnd());
1338: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1339: PetscFunctionReturn(PETSC_SUCCESS);
1340: }
1342: struct MatCOOStruct_SeqAIJKokkos {
1343: PetscCount n;
1344: PetscCount Atot;
1345: PetscInt nz;
1346: PetscCountKokkosView jmap;
1347: PetscCountKokkosView perm;
1349: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1350: {
1351: nz = coo_h->nz;
1352: n = coo_h->n;
1353: Atot = coo_h->Atot;
1354: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1355: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1356: }
1357: };
/* Container destroy callback for the device COO struct */
static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(PetscCtxRt data)
{
  PetscFunctionBegin;
  /* data is the address of the stored pointer; delete the struct it points to */
  PetscCallCXX(delete *static_cast<MatCOOStruct_SeqAIJKokkos **>(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* COO preallocation: build the host CSR/COO structs, rebuild the device mirror for the new
   sparsity pattern, and attach a device copy of the COO maps to the matrix */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJKokkos          *akok;
  Mat_SeqAIJ                *aseq;
  PetscContainer             container_h;
  MatCOOStruct_SeqAIJ       *coo_h;
  MatCOOStruct_SeqAIJKokkos *coo_d;

  PetscFunctionBegin;
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); /* host side first; composes the host COO container */
  aseq = static_cast<Mat_SeqAIJ *>(mat->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
  /* The sparsity pattern changed: throw away the old device mirror and build a fresh one */
  delete akok;
  mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
  PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Set (or add) matrix values from COO input v[] of length coo->n, using the jmap/perm maps
   built in MatSetPreallocationCOO. v[] may live in host or device memory. */
static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
{
  MatScalarKokkosView        Aa;
  ConstMatScalarKokkosView   kv;
  PetscMemType               memtype;
  PetscContainer             container;
  MatCOOStruct_SeqAIJKokkos *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  const auto &n    = coo->n;    /* number of COO entries */
  const auto &Annz = coo->nz;   /* number of nonzeros in A */
  const auto &jmap = coo->jmap; /* jmap(i..i+1) brackets COO entries contributing to nonzero i */
  const auto &perm = coo->perm; /* permutation from COO order to nonzero order */

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
  } else {
    kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
  else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

  PetscCall(PetscLogGpuTimeBegin());
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
      /* each nonzero accumulates all COO entries mapped onto it (duplicates are summed) */
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
      Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
    });
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
  else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Binding change is rejected: only a no-op call (flg equal to the current binding) succeeds */
PETSC_INTERN PetscErrorCode MatBindToCPU_SeqAIJKokkos(Mat A, PetscBool flg)
{
  PetscFunctionBegin;
  PetscCheck(A->boundtocpu == flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Changing binding of a MATAIJKOKKOS matrix is not supported yet");
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Install the Kokkos implementations into A's function tables (both the Mat ops table and
   the Mat_SeqAIJ inner ops table), and compose the COO and HYPRE-convert methods */
static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_KOKKOS;                                           // We do not really use this flag
  A->boundtocpu  = PetscDefined(HAVE_KOKKOS_WITHOUT_GPU) ? PETSC_TRUE : PETSC_FALSE; // MATAIJKOKKOS has yet to support CPU binding. But in this case, we deem it is bound to CPU.
  A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
  A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
  A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
  A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
  A->ops->scale                     = MatScale_SeqAIJKokkos;
  A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
  A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
  A->ops->mult                      = MatMult_SeqAIJKokkos;
  A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
  A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
  A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
  A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
  A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
  A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
  A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
  A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
  A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
  A->ops->shift                     = MatShift_SeqAIJKokkos;
  A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
  A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
  A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
  A->ops->bindtocpu                 = MatBindToCPU_SeqAIJKokkos;
  /* inner AIJ array-access hooks, so Mat{Get,Restore}Array variants see device-coherent data */
  a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
  a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
  a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
  a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
  a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
  a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
  a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1485: /*
  Extract the (prescribed) diagonal blocks of the matrix and then invert them
1488: Input Parameters:
1489: + A - the MATSEQAIJKOKKOS matrix
1490: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1491: . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1492: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1493: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1495: Output Parameter:
1496: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1497: */
// Extract the (prescribed) diagonal blocks of A into diagVal and invert each block in place
// in a batched fashion on device (one Kokkos team per block).
// NOTE(review): blkMap is not referenced in this routine's body — presumably kept for interface
// symmetry with the caller; confirm.
PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
{
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt nblocks = bs.extent(0) - 1; // bs is in 'csr' format, hence it has nblocks+1 entries

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

  // Pull out the diagonal blocks of the matrix and then invert the blocks
  auto Aa = akok->a_dual.view_device();
  auto Ai = akok->i_dual.view_device();
  auto Aj = akok->j_dual.view_device();
  auto Adiag = akok->diag_dual.view_device();
  // TODO: how to tune the team size?
#if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
  auto ts = Kokkos::AUTO();
#else
  auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
#endif
  // One team per diagonal block: gather the block from A's CSR into the dense matrix B, then invert B
  PetscCallCXX(Kokkos::parallel_for(
    Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
      const PetscInt bid    = teamMember.league_rank();                                                   // block id
      const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
      const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
      const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
      const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);                              // scratch buffer for the inversion

      // Team threads cooperatively gather rows of this block into B; missing CSR entries become 0.0
      Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
        PetscInt i = rstart + r; // i-th row in A

        if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
          PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row

          for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
            if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
              B(r, c) = 0.0;
            } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
              B(r, c) = Aa(first + c);
            } else { // this entry does not show up in the CSR
              B(r, c) = 0.0;
            }
          }
        } else { // rare case that the diagonal does not exist
          const PetscInt begin = Ai(i);
          const PetscInt end   = Ai(i + 1);
          for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
          for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
            if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
            else if (Aj(j) >= rstart + m) break; // early exit; assumes column indices are sorted within the row — TODO confirm
          }
        }
      });

      // LU-decompose B (w/o pivoting) and then invert B, using W as workspace
      KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
      KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
    }));
  // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
  PetscFunctionReturn(PETSC_SUCCESS);
}
1559: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1560: {
1561: Mat_SeqAIJ *aseq;
1562: PetscInt i, m, n;
1563: auto exec = PetscGetKokkosExecutionSpace();
1565: PetscFunctionBegin;
1566: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1568: m = akok->nrows();
1569: n = akok->ncols();
1570: PetscCall(MatSetSizes(A, m, n, m, n));
1571: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1573: /* Set up data structures of A as a MATSEQAIJ */
1574: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1575: aseq = (Mat_SeqAIJ *)A->data;
1577: PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1578: PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1580: aseq->i = akok->i_host_data();
1581: aseq->j = akok->j_host_data();
1582: aseq->a = akok->a_host_data();
1583: aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1584: aseq->free_a = PETSC_FALSE;
1585: aseq->free_ij = PETSC_FALSE;
1586: aseq->nz = akok->nnz();
1587: aseq->maxnz = aseq->nz;
1589: PetscCall(PetscMalloc1(m, &aseq->imax));
1590: PetscCall(PetscMalloc1(m, &aseq->ilen));
1591: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1593: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1594: akok->nonzerostate = A->nonzerostate;
1595: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1596: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1597: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1598: PetscFunctionReturn(PETSC_SUCCESS);
1599: }
1601: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1602: {
1603: PetscFunctionBegin;
1604: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1605: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1606: PetscFunctionReturn(PETSC_SUCCESS);
1607: }
1609: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1610: {
1611: Mat_SeqAIJKokkos *akok;
1613: PetscFunctionBegin;
1614: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1615: PetscCall(MatCreate(comm, A));
1616: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1617: PetscFunctionReturn(PETSC_SUCCESS);
1618: }
/* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1622: Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1623: */
PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); /* the new matrix adopts akok */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1632: /*@C
1633: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1634: (the default parallel PETSc format). This matrix will ultimately be handled by
1635: Kokkos for calculations.
1637: Collective
1639: Input Parameters:
1640: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1641: . m - number of rows
1642: . n - number of columns
1643: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1644: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1646: Output Parameter:
1647: . A - the matrix
1649: Level: intermediate
1651: Notes:
1652: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
1654: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1656: The AIJ format, also called
1657: compressed row storage, is fully compatible with standard Fortran
1658: storage. That is, the stored row and column indices can begin at
1659: either one (as in Fortran) or zero.
1661: Specify the preallocated storage with either `nz` or `nnz` (not both).
1662: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1663: allocation.
1665: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1666: @*/
PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); /* make sure Kokkos is initialized before creating a Kokkos matrix */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); /* host-side preallocation; device arrays are built lazily */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1678: // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1679: // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1680: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
// Similarly, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
// Ensure sptrsv_symbolic has been performed on the (non-transposed) L/U factors, so that
// sptrsv_solve can be called on them
static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
{
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
  const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky

  PetscFunctionBegin;
  if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
    if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
    if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
    factors->sptrsv_symbolic_completed = PETSC_TRUE; // reset to false whenever the factors' values change
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Ensure the transposed factors (U^T always; L^T too with LU) are up to date on device and have
// been analyzed with sptrsv_symbolic, so that transpose solves (and the U^T step of Cholesky
// solves) can be performed. The transpose is computed on host when the factors live on host
// (iU_h/iL_h non-empty), otherwise on device with KK's transpose_matrix().
static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
{
  const PetscInt n = A->rmap->n;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
  const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

  PetscFunctionBegin;
  if (!factors->transpose_updated) {
    if (has_upper) {
      if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
        factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
        factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
        factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
      }

      if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
        if (!factors->U) { // first time: wrap the host factor arrays in a Mat, then transpose it
          Mat_SeqAIJ *seq;

          PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
          PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

          seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
          factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); // alias Ut's CSR arrays, no copy
          factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
          factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
        } else {
          PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
        }
        // Copy Ut from host to device
        PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
      } else { // If U was computed on device, we also compute the transpose there
        // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
        PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
                                                                                                                                                                                                                           factors->jU_d, factors->aU_d,
                                                                                                                                                                                                                           factors->iUt_d, factors->jUt_d,
                                                                                                                                                                                                                           factors->aUt_d));
        PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
      }
      PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); // re-analyze since Ut's values changed
    }

    // do the same for L with LU
    if (has_lower) {
      if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
        factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
        factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
        factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
      }

      if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
        if (!factors->L) { // first time: wrap the host factor arrays in a Mat, then transpose it
          Mat_SeqAIJ *seq;

          PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
          PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

          seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
          factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); // alias Lt's CSR arrays, no copy
          factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
          factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
        } else {
          PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
        }
        // Copy Lt from host to device
        PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
      } else { // If L was computed on device, we also compute the transpose there
        // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
        PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
                                                                                                                                                                                                                           factors->jL_d, factors->aL_d,
                                                                                                                                                                                                                           factors->iLt_d, factors->jLt_d,
                                                                                                                                                                                                                           factors->aLt_d));
        PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
      }
      PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); // re-analyze since Lt's values changed
    }

    factors->transpose_updated = PETSC_TRUE; // reset to false whenever the factors' values change
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
// R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView D = factors->D_d; // diagonal factor; already inverted, see comment below
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = Y
  PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve U^T Y = B
  if (identity) { // no reordering: use b directly as the right-hand side B
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // reorder b with the row permutation into the work vector
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
    Y = x; // x serves as scratch for Y; its final value is written below
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

  // Solve diag(D) Y' = Y.
  // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
  PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

  // Solve UX = Y
  if (identity) {
    X = x;
  } else {
    X = factors->workVector; // B is not needed anymore
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

  // Reorder X with the inverse column (row) permutation
  if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
// R and C are represented by rowperm and colperm in factors.
// If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscIntKokkosView &colperm = factors->colperm;
  PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
  PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSolveCheck(A)); // make sure sptrsv_symbolic has been done for L and U
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
  if (row_identity) { // no row reordering: use b directly as the right-hand side B
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // apply the row permutation on b into the work vector
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
    Y = x; // x serves as scratch for Y; its final value is written below
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

  // Solve U C^- x = Y
  if (col_identity) {
    X = x;
  } else {
    X = factors->workVector; // B is not needed anymore
    // NOTE(review): when row_identity is true, Y is also factors->workVector, so this solve is
    // in-place; confirm sptrsv_solve supports aliased input/output vectors
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

  // x = C X; Reorder X with the inverse col permutation
  if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
// R and C are represented by rowperm and colperm in factors.
// If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
// A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscIntKokkosView &colperm = factors->colperm;
  PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
  PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
  if (col_identity) { // no column reordering: use b directly as the right-hand side B
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // apply the column permutation on b into the work vector
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
    Y = x; // x serves as scratch for Y; its final value is written below
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

  // Solve L^T X = Y
  if (row_identity) {
    X = x;
  } else {
    X = factors->workVector; // B is not needed anymore
    // NOTE(review): when col_identity is true, Y is also factors->workVector, so this solve is
    // in-place; confirm sptrsv_solve supports aliased input/output vectors
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

  // x = R^- X = R^T X; Reorder X with the inverse row permutation
  if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1939: {
1940: PetscFunctionBegin;
1941: PetscCall(MatSeqAIJKokkosSyncHost(A));
1942: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1944: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1945: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1946: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1947: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1948: const MatScalar *Ba = b->a;
1949: PetscInt m = B->rmap->n, n = B->cmap->n;
1951: if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1952: // Allocate memory and copy the structure
1953: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1954: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1955: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1956: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1957: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1958: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1960: PetscInt *Li = factors->iL_h.data();
1961: PetscInt *Lj = factors->jL_h.data();
1962: PetscInt *Ui = factors->iU_h.data();
1963: PetscInt *Uj = factors->jU_h.data();
1965: Li[0] = Ui[0] = 0;
1966: for (PetscInt i = 0; i < m; i++) {
1967: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1968: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1970: PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1971: Lj[Li[i] + llen] = i; // diagonal entry of L
1973: Uj[Ui[i]] = i; // diagonal entry of U
1974: PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
1976: Li[i + 1] = Li[i] + llen + 1;
1977: Ui[i + 1] = Ui[i] + ulen;
1978: }
1980: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1981: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1982: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1983: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1984: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1985: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1987: // Copy row/col permutation to device
1988: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1989: PetscBool row_identity;
1990: PetscCall(ISIdentity(rowperm, &row_identity));
1991: if (!row_identity) {
1992: const PetscInt *ip;
1994: PetscCall(ISGetIndices(rowperm, &ip));
1995: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1996: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1997: PetscCall(ISRestoreIndices(rowperm, &ip));
1998: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1999: }
2001: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
2002: PetscBool col_identity;
2003: PetscCall(ISIdentity(colperm, &col_identity));
2004: if (!col_identity) {
2005: const PetscInt *ip;
2007: PetscCall(ISGetIndices(colperm, &ip));
2008: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2009: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2010: PetscCall(ISRestoreIndices(colperm, &ip));
2011: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2012: }
2014: /* Create sptrsv handles for L, U and their transpose */
2015: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2016: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2017: #else
2018: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2019: #endif
2020: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2021: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2022: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2023: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2024: }
2026: // Copy the value
2027: for (PetscInt i = 0; i < m; i++) {
2028: PetscInt llen = Bi[i + 1] - Bi[i];
2029: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2030: const PetscInt *Li = factors->iL_h.data();
2031: const PetscInt *Ui = factors->iU_h.data();
2033: PetscScalar *La = factors->aL_h.data();
2034: PetscScalar *Ua = factors->aU_h.data();
2036: PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2037: La[Li[i] + llen] = 1.0; // diagonal entry
2039: Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry
2040: PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2041: }
2043: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2044: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2045: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2046: factors->transpose_updated = PETSC_FALSE;
2047: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2049: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2050: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2051: }
2053: B->ops->matsolve = NULL;
2054: B->ops->matsolvetranspose = NULL;
2055: PetscFunctionReturn(PETSC_SUCCESS);
2056: }
// Numeric ILU(0) factorization of A into B on device using KK's spiluk_numeric(); the symbolic
// phase was done in MatILUFactorSymbolic_SeqAIJKokkos_ILU0(), which allocated the L/U device
// arrays and set up factors->kh.
static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
  PetscInt fill_lev = info->levels;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // the factorization reads A's values on device

  auto a_d = aijkok->a_dual.view_device();
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();

  PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));

  B->assembled = PETSC_TRUE;
  B->preallocated = PETSC_TRUE;
  B->ops->solve = MatSolve_SeqAIJKokkos_LU;
  B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
  factors->transpose_updated = PETSC_FALSE;
  factors->sptrsv_symbolic_completed = PETSC_FALSE;
  /* TODO: log flops, but how to know that? */
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
// (the IS arguments are intentionally unnamed/ignored). Allocates the device L/U arrays,
// creates the spiluk and sptrsv handles, and fills in B's MATSEQAIJ bookkeeping fields.
static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJKokkos *aijkok;
  Mat_SeqAIJ *b;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
  PetscInt fill_lev = info->levels;
  PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // the symbolic phase reads A's structure on device

  /* Create a spiluk handle and then do symbolic factorization */
  nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); /* initial nnz estimates for L and U */
  factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

  auto spiluk_handle = factors->kh.get_spiluk_handle();

  Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
  Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
  Kokkos::realloc(factors->iU_d, n + 1);
  Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

  aijkok = (Mat_SeqAIJKokkos *)A->spptr;
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();
  PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
  /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

  // The symbolic phase determined the actual nonzero counts; adjust the arrays to match
  Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
  Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
  Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
  Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

  /* TODO: add options to select sptrsv algorithms */
  /* Create sptrsv handles for L, U and their transpose */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
  auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif

  factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
  factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
  factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
  factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

  /* Fill fields of the factor matrix B */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b = (Mat_SeqAIJ *)B->data;
  b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
  B->info.fill_ratio_given = info->fill;
  B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

  B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
// Symbolic LU factorization: delegate to PETSc's host SeqAIJ symbolic phase, then attach an
// (empty) Kokkos triangular-factor container to B so the numeric phase can populate device data.
static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // host symbolic factorization with the given row/col orderings
  PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
  PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); // holds device views and sptrsv handles for the factors
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; // override the numeric op installed by the host symbolic routine
  PetscFunctionReturn(PETSC_SUCCESS);
}
2160: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2161: {
2162: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2164: PetscFunctionBegin;
2165: if (!info->factoronhost) {
2166: PetscCall(ISIdentity(isrow, &row_identity));
2167: PetscCall(ISIdentity(iscol, &col_identity));
2168: }
2170: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2171: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2173: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2174: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2175: } else {
2176: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2177: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2178: }
2179: PetscFunctionReturn(PETSC_SUCCESS);
2180: }
// Numeric Cholesky factorization: the factorization itself runs on the host via SeqAIJ; when
// solves are to run on the device we additionally mirror the factor into device views as a
// unit-diagonal U plus a separate diagonal array D, for use by the Kokkos sptrsv solvers.
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncHost(A)); // the host factorization below reads A's values on host
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

  if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
    Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
    Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
    const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
    const MatScalar *Ba = b->a;
    PetscInt m = B->rmap->n;

    if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
      // Allocate memory and copy the structure
      factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h (unmanaged; shares B's memory)
      factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
      factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
      factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
      factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
      factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

      // Build jU_h from the skewed Bj: row i of B provides ulen-1 column indices at Bj[Bi[i]]
      // (the strictly-upper part), and we prepend the diagonal index to complete U's row
      PetscInt *Uj = factors->jU_h.data();
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Bi[i + 1] - Bi[i]; // entries in row i of U, diagonal included
        Uj[Bi[i]] = i; // diagonal entry
        PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
      }

      // Copy iU, jU to device
      PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
      PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

      // Copy row/col permutation to device (only needed when it is not the identity)
      IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
      PetscBool row_identity;
      PetscCall(ISIdentity(rowperm, &row_identity));
      if (!row_identity) {
        const PetscInt *ip;

        PetscCall(ISGetIndices(rowperm, &ip));
        PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
        PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
        PetscCall(ISRestoreIndices(rowperm, &ip));
        PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
      }

      // Create sptrsv handles for U and U^T
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
      auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
      auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif
      factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
      factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
    }
    // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
    B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
    B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

    // Copy the values: D takes the diagonal, U gets a unit diagonal and the off-diagonal entries
    PetscScalar *Ua = factors->aU_h.data();
    PetscScalar *D = factors->D_h.data();
    for (PetscInt i = 0; i < m; i++) {
      D[i] = Ba[Bdiag[i]]; // Ba[Bdiag[i]] is actually the inverse of the diagonal
      Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
      // NOTE(review): the off-diagonals are copied with a sign flip, which suggests the host
      // factorization stores them negated -- confirm against MatCholeskyFactorNumeric_SeqAIJ()
      for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
    }
    PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
    PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

    factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
    factors->transpose_updated = PETSC_FALSE;
  }

  // Cleared unconditionally (NOTE(review): even on the solve-on-host path), so MatMatSolve
  // falls back to the default implementation
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2263: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2264: {
2265: PetscFunctionBegin;
2266: if (info->solveonhost) {
2267: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2268: PetscCall(MatSetType(B, MATSEQSBAIJ));
2269: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2270: }
2272: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2274: if (!info->solveonhost) {
2275: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2276: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2277: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2278: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2279: }
2280: PetscFunctionReturn(PETSC_SUCCESS);
2281: }
2283: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2284: {
2285: PetscFunctionBegin;
2286: if (info->solveonhost) {
2287: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2288: PetscCall(MatSetType(B, MATSEQSBAIJ));
2289: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2290: }
2292: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2294: if (!info->solveonhost) {
2295: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2296: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2297: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2298: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2299: }
2300: PetscFunctionReturn(PETSC_SUCCESS);
2301: }
// The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix.
// Composed on factor matrices as "MatFactorGetSolverType_C"; reports the solver package that
// produced the factor. The Mat argument is unused.
static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERKOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*MC
  MATSOLVERKOKKOS = "Kokkos" - A matrix solver type that provides triangular solvers on a single GPU
  for sequential matrices of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
M*/
2319: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2320: {
2321: PetscInt n = A->rmap->n;
2322: MPI_Comm comm;
2324: PetscFunctionBegin;
2325: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2326: PetscCall(MatCreate(comm, B));
2327: PetscCall(MatSetSizes(*B, n, n, n, n));
2328: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2329: (*B)->factortype = ftype;
2330: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2331: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2332: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2334: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2335: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2336: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2337: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2338: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2339: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2340: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2341: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2342: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2343: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2344: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2345: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2347: // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2348: (*B)->canuseordering = PETSC_TRUE;
2349: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2350: PetscFunctionReturn(PETSC_SUCCESS);
2351: }
2353: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2354: {
2355: PetscFunctionBegin;
2356: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2357: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2358: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2359: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2360: PetscFunctionReturn(PETSC_SUCCESS);
2361: }
2363: /* Utility to print out a KokkosCsrMatrix for debugging */
2364: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2365: {
2366: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2367: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2368: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2369: const PetscInt *i = iv.data();
2370: const PetscInt *j = jv.data();
2371: const PetscScalar *a = av.data();
2372: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2374: PetscFunctionBegin;
2375: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2376: for (PetscInt k = 0; k < m; k++) {
2377: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2378: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2379: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2380: }
2381: PetscFunctionReturn(PETSC_SUCCESS);
2382: }