Actual source code: aijkok.kokkos.cxx
1: #include <petsc_kokkos.hpp>
2: #include <petscvec_kokkos.hpp>
3: #include <petscmat_kokkos.hpp>
4: #include <petscpkg_version.h>
5: #include <petsc/private/petscimpl.h>
6: #include <petsc/private/sfimpl.h>
7: #include <petsc/private/kokkosimpl.hpp>
8: #include <petscsys.h>
10: #include <Kokkos_Core.hpp>
11: #include <KokkosBlas.hpp>
12: #include <KokkosSparse_CrsMatrix.hpp>
14: // To suppress compiler warnings:
15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
20: #include <KokkosSparse_spmv.hpp>
21: PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
23: #include <KokkosSparse_spiluk.hpp>
24: #include <KokkosSparse_sptrsv.hpp>
25: #include <KokkosSparse_spgemm.hpp>
26: #include <KokkosSparse_spadd.hpp>
27: #include <KokkosBatched_LU_Decl.hpp>
28: #include <KokkosBatched_InverseLU_Decl.hpp>
30: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
32: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
33: #include <KokkosSparse_Utils.hpp>
34: using KokkosSparse::sort_crs_matrix;
35: using KokkosSparse::Impl::transpose_matrix;
36: #else
37: #include <KokkosKernels_Sorting.hpp>
38: using KokkosKernels::sort_crs_matrix;
39: using KokkosKernels::Impl::transpose_matrix;
40: #endif
42: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
43: using KokkosSparse::spiluk_symbolic;
44: using KokkosSparse::spiluk_numeric;
45: using KokkosSparse::sptrsv_symbolic;
46: using KokkosSparse::sptrsv_solve;
47: using KokkosSparse::Experimental::SPTRSVAlgorithm;
48: using KokkosSparse::Experimental::SPILUKAlgorithm;
49: #else
50: using KokkosSparse::Experimental::spiluk_symbolic;
51: using KokkosSparse::Experimental::spiluk_numeric;
52: using KokkosSparse::Experimental::sptrsv_symbolic;
53: using KokkosSparse::Experimental::sptrsv_solve;
54: using KokkosSparse::Experimental::SPTRSVAlgorithm;
55: using KokkosSparse::Experimental::SPILUKAlgorithm;
56: #endif
58: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
60: /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalize the nonzeros of the matrix, either after
61:    we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
62: In the latter case, it is important to set a_dual's sync state correctly.
63: */
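/* A minimal sketch of the Kokkos::DualView sync protocol referred to above (illustrative only;
   the view name "v" and length "n" are hypothetical):

     Kokkos::DualView<PetscScalar *> v("v", n); // one allocation on host, one on device
     v.modify_device();                         // declare the device copy as the freshest
     if (v.need_sync_host()) v.sync_host();     // deep-copies device -> host only when needed
     v.clear_sync_state();                      // both sides treated as coherent again

   MatAssemblyEnd_SeqAIJKokkos() must leave a_dual marked as modified on whichever side
   actually produced the values (host assembly vs. device MatMatMult).
*/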
64: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
65: {
66: Mat_SeqAIJ *aijseq;
67: Mat_SeqAIJKokkos *aijkok;
69: PetscFunctionBegin;
70: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
71: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
73: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
74: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
76: /* If aijkok does not exist, we just copy i, j to device.
77:    If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
78: In both cases, we build a new aijkok structure.
79: */
80: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
81:     if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing the much-needed a,i,j on host when deleting aijkok */
82: PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
83: PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
84:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
85: PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
86: PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
87: PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
88: aijseq->free_a = PETSC_TRUE;
89: aijseq->free_ij = PETSC_TRUE;
90: /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
91: we have the CsrMatrix on device first and then copy to host, followed by
92: MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
93: One could improve it by not using NULL garray.
94: */
95: }
96: delete aijkok;
97: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
98: A->spptr = aijkok;
99: } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
100: MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
101: auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
102: aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
103: }
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }
107: /* Sync CSR data to device if not yet done */
108: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
109: {
110: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
112: PetscFunctionBegin;
113: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
114: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
115: if (aijkok->a_dual.need_sync_device()) {
116: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
117:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out-of-date */
118: aijkok->hermitian_updated = PETSC_FALSE;
119: }
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: /* Mark the CSR data on device as modified */
124: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
125: {
126: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
128: PetscFunctionBegin;
129:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
130: aijkok->a_dual.clear_sync_state();
131: aijkok->a_dual.modify_device();
132: aijkok->transpose_updated = PETSC_FALSE;
133: aijkok->hermitian_updated = PETSC_FALSE;
134: PetscCall(MatSeqAIJInvalidateDiagonal(A));
135: PetscCall(PetscObjectStateIncrease((PetscObject)A));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
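/* A typical caller-side pattern for the two helpers above (a sketch, assuming "A" is an
   assembled MATSEQAIJKOKKOS matrix; not a verbatim excerpt of PETSc code):

     PetscCall(MatSeqAIJKokkosSyncDevice(A)); // make device values current
     auto Aa = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->a_dual.view_device();
     PetscCallCXX(Kokkos::parallel_for( // mutate values on device
       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Aa.extent(0)),
       KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= 2.0; }));
     PetscCall(MatSeqAIJKokkosModifyDevice(A)); // invalidate host values, diagonal, transposes
*/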
139: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
140: {
141: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
142: auto exec = PetscGetKokkosExecutionSpace();
144: PetscFunctionBegin;
145: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
146:   /* We do not expect anyone to need factors on host */
147: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
148: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
149: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
150: PetscFunctionReturn(PETSC_SUCCESS);
151: }
153: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
154: {
155: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157: PetscFunctionBegin;
158:   /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
159:      Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
160:      reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
161: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
162: */
163: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
164: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
165: *array = aijkok->a_dual.view_host().data();
166: } else { /* Happens when calling MatSetValues on a newly created matrix */
167: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
168: }
169: PetscFunctionReturn(PETSC_SUCCESS);
170: }
172: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
173: {
174: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
176: PetscFunctionBegin;
177: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
178: PetscFunctionReturn(PETSC_SUCCESS);
179: }
181: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
182: {
183: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185: PetscFunctionBegin;
186: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
187: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
188: *array = aijkok->a_dual.view_host().data();
189: } else {
190: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
191: }
192: PetscFunctionReturn(PETSC_SUCCESS);
193: }
195: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
196: {
197: PetscFunctionBegin;
198: *array = NULL;
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
203: {
204: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
206: PetscFunctionBegin;
207: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
208: *array = aijkok->a_dual.view_host().data();
209:   } else { /* E.g., this happens with MatZeroEntries on a preallocated but not assembled matrix */
210: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
211: }
212: PetscFunctionReturn(PETSC_SUCCESS);
213: }
215: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
216: {
217: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219: PetscFunctionBegin;
220: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
221: aijkok->a_dual.clear_sync_state();
222: aijkok->a_dual.modify_host();
223: }
224: PetscFunctionReturn(PETSC_SUCCESS);
225: }
227: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
228: {
229: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231: PetscFunctionBegin;
232: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
234: if (i) *i = aijkok->i_device_data();
235: if (j) *j = aijkok->j_device_data();
236: if (a) {
237: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
238: *a = aijkok->a_device_data();
239: }
240: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
241: PetscFunctionReturn(PETSC_SUCCESS);
242: }
244: /*
245: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
247: Input Parameter:
248: . A - the MATSEQAIJKOKKOS matrix
250: Output Parameters:
251: .  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
252: - T_d - the transpose on device, whose value array is allocated but not initialized
253: */
254: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
255: {
256: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
257: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
258: const PetscInt *Ai = aseq->i, *Aj = aseq->j;
259: MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
260: MatRowMapType *Ti = Ti_h.data();
261: MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
262: MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
263: PetscInt *Tj = Tj_h.data();
264: PetscInt *perm = perm_h.data();
265: PetscInt *offset;
267: PetscFunctionBegin;
268: // Populate Ti
269: PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
270: Ti++;
271: for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
272: Ti--;
273: for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
275: // Populate Tj and the permutation array
276: PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
277: for (PetscInt i = 0; i < m; i++) {
278: for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
279: PetscInt r = Aj[j]; // row r of T
280: PetscInt disp = Ti[r] + offset[r];
282: Tj[disp] = i; // col i of T
283: perm[disp] = j;
284: offset[r]++;
285: }
286: }
287: PetscCall(PetscFree(offset));
289: // Sort each row of T, along with the permutation array
290: for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
292: // Output perm and T on device
293: auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
294: auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
295: PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
296: PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
297: PetscFunctionReturn(PETSC_SUCCESS);
298: }
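/* A tiny worked example of the routine above (illustrative). For the 2x3 matrix
     A = [1 0 2; 0 3 0]
   we have Ai = {0,2,3}, Aj = {0,2,1}, Aa = {1,2,3}. Counting the column occurrences and
   prefix-summing gives Ti = {0,1,2,3}; the fill pass then produces Tj = {0,1,0} and
   perm = {0,2,1}, so Ta(i) = Aa(perm(i)) yields Ta = {1,3,2}, i.e., the 3x2 transpose
     T = [1 0; 0 3; 2 0]
*/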
300: // Generate the transpose on device and cache it internally
301: // Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
302: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
303: {
304: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
305: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
306: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
307: KokkosCsrMatrix &T = akok->csrmatT;
309: PetscFunctionBegin;
310: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
311: PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
313: const auto &Aa = akok->a_dual.view_device();
315: if (A->symmetric == PETSC_BOOL3_TRUE) {
316: *csrmatT = akok->csrmat;
317: } else {
318: // See if we already have a cached transpose and its value is up to date
319: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
320: if (!akok->transpose_updated) { // if the value is out of date, update the cached version
321: const auto &perm = akok->transpose_perm; // get the permutation array
322: auto &Ta = T.values;
324: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
325: }
326: } else { // Generate T of size n x m for the first time
327: MatRowMapKokkosView perm;
329: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
330: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
331: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
332: }
333: akok->transpose_updated = PETSC_TRUE;
334: *csrmatT = akok->csrmatT;
335: }
336: PetscFunctionReturn(PETSC_SUCCESS);
337: }
339: // Generate the Hermitian on device and cache it internally
340: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
341: {
342: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
343: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
344: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
345: KokkosCsrMatrix &T = akok->csrmatH;
347: PetscFunctionBegin;
348: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
349: PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
351: const auto &Aa = akok->a_dual.view_device();
353: if (A->hermitian == PETSC_BOOL3_TRUE) {
354: *csrmatH = akok->csrmat;
355: } else {
356: // See if we already have a cached hermitian and its value is up to date
357:     if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
358: if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
359: const auto &perm = akok->transpose_perm; // get the permutation array
360: auto &Ta = T.values;
362: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
363: }
364: } else { // Generate T of size n x m for the first time
365: MatRowMapKokkosView perm;
367: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
368: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
369: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
370: }
371: akok->hermitian_updated = PETSC_TRUE;
372: *csrmatH = akok->csrmatH;
373: }
374: PetscFunctionReturn(PETSC_SUCCESS);
375: }
377: /* y = A x */
378: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
379: {
380: Mat_SeqAIJKokkos *aijkok;
381: ConstPetscScalarKokkosView xv;
382: PetscScalarKokkosView yv;
384: PetscFunctionBegin;
385: PetscCall(PetscLogGpuTimeBegin());
386: PetscCall(MatSeqAIJKokkosSyncDevice(A));
387: PetscCall(VecGetKokkosView(xx, &xv));
388: PetscCall(VecGetKokkosViewWrite(yy, &yv));
389: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
390: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
391: PetscCall(VecRestoreKokkosView(xx, &xv));
392: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
393:   /* 2.0*nnz - numRows would be more accurate here, but that assumes there are no zero rows. So we are a little sloppy. */
394: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
395: PetscCall(PetscLogGpuTimeEnd());
396: PetscFunctionReturn(PETSC_SUCCESS);
397: }
399: /* y = A^T x */
400: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
401: {
402: Mat_SeqAIJKokkos *aijkok;
403: const char *mode;
404: ConstPetscScalarKokkosView xv;
405: PetscScalarKokkosView yv;
406: KokkosCsrMatrix csrmat;
408: PetscFunctionBegin;
409: PetscCall(PetscLogGpuTimeBegin());
410: PetscCall(MatSeqAIJKokkosSyncDevice(A));
411: PetscCall(VecGetKokkosView(xx, &xv));
412: PetscCall(VecGetKokkosViewWrite(yy, &yv));
413: if (A->form_explicit_transpose) {
414: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
415: mode = "N";
416: } else {
417: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
418: csrmat = aijkok->csrmat;
419: mode = "T";
420: }
421: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
422: PetscCall(VecRestoreKokkosView(xx, &xv));
423: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
424: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
425: PetscCall(PetscLogGpuTimeEnd());
426: PetscFunctionReturn(PETSC_SUCCESS);
427: }
429: /* y = A^H x */
430: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
431: {
432: Mat_SeqAIJKokkos *aijkok;
433: const char *mode;
434: ConstPetscScalarKokkosView xv;
435: PetscScalarKokkosView yv;
436: KokkosCsrMatrix csrmat;
438: PetscFunctionBegin;
439: PetscCall(PetscLogGpuTimeBegin());
440: PetscCall(MatSeqAIJKokkosSyncDevice(A));
441: PetscCall(VecGetKokkosView(xx, &xv));
442: PetscCall(VecGetKokkosViewWrite(yy, &yv));
443: if (A->form_explicit_transpose) {
444: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
445: mode = "N";
446: } else {
447: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
448: csrmat = aijkok->csrmat;
449: mode = "C";
450: }
451: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
452: PetscCall(VecRestoreKokkosView(xx, &xv));
453: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
454: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
455: PetscCall(PetscLogGpuTimeEnd());
456: PetscFunctionReturn(PETSC_SUCCESS);
457: }
459: /* z = A x + y */
460: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462: Mat_SeqAIJKokkos *aijkok;
463: ConstPetscScalarKokkosView xv;
464: PetscScalarKokkosView zv;
466: PetscFunctionBegin;
467: PetscCall(PetscLogGpuTimeBegin());
468: PetscCall(MatSeqAIJKokkosSyncDevice(A));
469: if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
470: PetscCall(VecGetKokkosView(xx, &xv));
471: PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
472: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
473: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
474: PetscCall(VecRestoreKokkosView(xx, &xv));
475: PetscCall(VecRestoreKokkosView(zz, &zv));
476: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
477: PetscCall(PetscLogGpuTimeEnd());
478: PetscFunctionReturn(PETSC_SUCCESS);
479: }
481: /* z = A^T x + y */
482: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
483: {
484: Mat_SeqAIJKokkos *aijkok;
485: const char *mode;
486: ConstPetscScalarKokkosView xv;
487: PetscScalarKokkosView zv;
488: KokkosCsrMatrix csrmat;
490: PetscFunctionBegin;
491: PetscCall(PetscLogGpuTimeBegin());
492: PetscCall(MatSeqAIJKokkosSyncDevice(A));
493: if (zz != yy) PetscCall(VecCopy(yy, zz));
494: PetscCall(VecGetKokkosView(xx, &xv));
495: PetscCall(VecGetKokkosView(zz, &zv));
496: if (A->form_explicit_transpose) {
497: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
498: mode = "N";
499: } else {
500: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
501: csrmat = aijkok->csrmat;
502: mode = "T";
503: }
504: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
505: PetscCall(VecRestoreKokkosView(xx, &xv));
506: PetscCall(VecRestoreKokkosView(zz, &zv));
507: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
508: PetscCall(PetscLogGpuTimeEnd());
509: PetscFunctionReturn(PETSC_SUCCESS);
510: }
512: /* z = A^H x + y */
513: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
514: {
515: Mat_SeqAIJKokkos *aijkok;
516: const char *mode;
517: ConstPetscScalarKokkosView xv;
518: PetscScalarKokkosView zv;
519: KokkosCsrMatrix csrmat;
521: PetscFunctionBegin;
522: PetscCall(PetscLogGpuTimeBegin());
523: PetscCall(MatSeqAIJKokkosSyncDevice(A));
524: if (zz != yy) PetscCall(VecCopy(yy, zz));
525: PetscCall(VecGetKokkosView(xx, &xv));
526: PetscCall(VecGetKokkosView(zz, &zv));
527: if (A->form_explicit_transpose) {
528: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
529: mode = "N";
530: } else {
531: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
532: csrmat = aijkok->csrmat;
533: mode = "C";
534: }
535: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
536: PetscCall(VecRestoreKokkosView(xx, &xv));
537: PetscCall(VecRestoreKokkosView(zz, &zv));
538: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
539: PetscCall(PetscLogGpuTimeEnd());
540: PetscFunctionReturn(PETSC_SUCCESS);
541: }
543: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
544: {
545: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
547: PetscFunctionBegin;
548: switch (op) {
549: case MAT_FORM_EXPLICIT_TRANSPOSE:
550:     /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
551: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
552: A->form_explicit_transpose = flg;
553: break;
554: default:
555: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
556: break;
557: }
558: PetscFunctionReturn(PETSC_SUCCESS);
559: }
561: /* Depending on reuse, either build a new mat, or use the existing mat */
562: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
563: {
564: Mat_SeqAIJ *aseq;
566: PetscFunctionBegin;
567: PetscCall(PetscKokkosInitializeCheck());
568: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
569: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
570: PetscCall(MatSetType(*newmat, mtype));
571: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
572: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
573: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
574: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
575: PetscCall(PetscFree(A->defaultvectype));
576: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
577: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
578: PetscCall(MatSetOps_SeqAIJKokkos(A));
579: aseq = static_cast<Mat_SeqAIJ *>(A->data);
580: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
581: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
582: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
583: }
584: }
585: PetscFunctionReturn(PETSC_SUCCESS);
586: }
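/* Conversion sketch (illustrative user-level calls):

     Mat B;
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &B)); // brand new Kokkos copy of A
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); // retype A itself
*/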
588: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
589:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
590: */
591: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
592: {
593: Mat_SeqAIJ *bseq;
594: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
595: Mat mat;
597: PetscFunctionBegin;
598: /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
599: PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
600: mat = *B;
601: if (A->assembled) {
602: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
603: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
604: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
605: /* Now copy values to B if needed */
606: if (dupOption == MAT_COPY_VALUES) {
607: if (akok->a_dual.need_sync_device()) {
608: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
609: bkok->a_dual.modify_host();
610: } else { /* If device has the latest data, we only copy data on device */
611: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
612: bkok->a_dual.modify_device();
613: }
614: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
615: /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
616: bkok->a_dual.modify_host();
617: }
618: mat->spptr = bkok;
619: }
621: PetscCall(PetscFree(mat->defaultvectype));
622: PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
623: PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
624: PetscCall(MatSetOps_SeqAIJKokkos(mat));
625: PetscFunctionReturn(PETSC_SUCCESS);
626: }
628: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
629: {
630: Mat At;
631: KokkosCsrMatrix internT;
632: Mat_SeqAIJKokkos *atkok, *bkok;
634: PetscFunctionBegin;
635: if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
636: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
637: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
638: /* Deep copy internT, as we want to isolate the internal transpose */
639: PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
640: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
641: if (reuse == MAT_INITIAL_MATRIX) *B = At;
642: else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
643: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
644: if ((*B)->assembled) {
645: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
646: PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
647: PetscCall(MatSeqAIJKokkosModifyDevice(*B));
648: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
649: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
650: MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
651: MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
652: PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
653: PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
654: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
655: }
656: PetscFunctionReturn(PETSC_SUCCESS);
657: }
659: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
660: {
661: Mat_SeqAIJKokkos *aijkok;
663: PetscFunctionBegin;
664: if (A->factortype == MAT_FACTOR_NONE) {
665: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
666: delete aijkok;
667: } else {
668: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
669: }
670: A->spptr = NULL;
671: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
672: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
673: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
674: #if defined(PETSC_HAVE_HYPRE)
675: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
676: #endif
677: PetscCall(MatDestroy_SeqAIJ(A));
678: PetscFunctionReturn(PETSC_SUCCESS);
679: }
681: /*MC
682: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
684: A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
686: Options Database Key:
687: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
689: Level: beginner
691: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
692: M*/
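/* A minimal usage sketch (standard PETSc Mat calls; error checking elided):

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, n, n, n, n);
     MatSetType(A, MATSEQAIJKOKKOS); // or -mat_type aijkokkos with MatSetFromOptions()
     MatSetUp(A);
     // ... MatSetValues() on host ...
     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY); // i,j are copied to device here; values sync lazily
*/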
693: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
694: {
695: PetscFunctionBegin;
696: PetscCall(PetscKokkosInitializeCheck());
697: PetscCall(MatCreate_SeqAIJ(A));
698: PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
699: PetscFunctionReturn(PETSC_SUCCESS);
700: }
702: /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
703: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
704: {
705: Mat_SeqAIJ *a, *b;
706: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
707: MatScalarKokkosView aa, ba, ca;
708: MatRowMapKokkosView ai, bi, ci;
709: MatColIdxKokkosView aj, bj, cj;
710: PetscInt m, n, nnz, aN;
712: PetscFunctionBegin;
715: PetscAssertPointer(C, 4);
716: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
717: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
718:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
719: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
721: PetscCall(MatSeqAIJKokkosSyncDevice(A));
722: PetscCall(MatSeqAIJKokkosSyncDevice(B));
723: a = static_cast<Mat_SeqAIJ *>(A->data);
724: b = static_cast<Mat_SeqAIJ *>(B->data);
725: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
726: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
727: aa = akok->a_dual.view_device();
728: ai = akok->i_dual.view_device();
729: ba = bkok->a_dual.view_device();
730: bi = bkok->i_dual.view_device();
731: m = A->rmap->n; /* M, N and nnz of C */
732: n = A->cmap->n + B->cmap->n;
733: nnz = a->nz + b->nz;
734: aN = A->cmap->n; /* N of A */
735: if (reuse == MAT_INITIAL_MATRIX) {
736: aj = akok->j_dual.view_device();
737: bj = bkok->j_dual.view_device();
738: auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
739: auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
740: auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
741: ca = ca_dual.view_device();
742: ci = ci_dual.view_device();
743: cj = cj_dual.view_device();
745: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
746: Kokkos::parallel_for(
747: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
748: PetscInt i = t.league_rank(); /* row i */
749: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
751: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
752: ci(i) = coffset;
753: if (i == m - 1) ci(m) = ai(m) + bi(m);
754: });
756: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
757: if (k < alen) {
758: ca(coffset + k) = aa(ai(i) + k);
759: cj(coffset + k) = aj(ai(i) + k);
760: } else {
761: ca(coffset + k) = ba(bi(i) + k - alen);
762: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
763: }
764: });
765: });
766: ca_dual.modify_device();
767: ci_dual.modify_device();
768: cj_dual.modify_device();
769: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
770: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771: } else if (reuse == MAT_REUSE_MATRIX) {
773: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775: ca = ckok->a_dual.view_device();
776: ci = ckok->i_dual.view_device();
778: Kokkos::parallel_for(
779: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780: PetscInt i = t.league_rank(); /* row i */
781: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784: else ca(ci(i) + k) = ba(bi(i) + k - alen);
785: });
786: });
787: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788: }
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
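/* A tiny worked example of the merge (illustrative): with A = [a00 a01] and B = [b00] sharing
   the same row map, MatSeqAIJKokkosMergeMats(A, B, MAT_INITIAL_MATRIX, &C) yields the 1 x 3
   matrix C = [a00 a01 b00], where B's column index 0 becomes aN + 0 = 2 in C.
*/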
792: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793: {
794: PetscFunctionBegin;
795: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796: PetscFunctionReturn(PETSC_SUCCESS);
797: }
799: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800: {
801: Mat_Product *product = C->product;
802: Mat A, B;
803: bool transA, transB; /* use bool, since KK needs this type */
804: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
805: Mat_SeqAIJ *c;
806: MatProductData_SeqAIJKokkos *pdata;
807: KokkosCsrMatrix csrmatA, csrmatB;
809: PetscFunctionBegin;
810: MatCheckProduct(C, 1);
811: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
814: // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815: // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816: // we still do numeric.
817: if (pdata->reusesym) { // numeric reuses results from symbolic
818: pdata->reusesym = PETSC_FALSE;
819: PetscFunctionReturn(PETSC_SUCCESS);
820: }
822: switch (product->type) {
823: case MATPRODUCT_AB:
824: transA = false;
825: transB = false;
826: break;
827: case MATPRODUCT_AtB:
828: transA = true;
829: transB = false;
830: break;
831: case MATPRODUCT_ABt:
832: transA = false;
833: transB = true;
834: break;
835: default:
836: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837: }
839: A = product->A;
840: B = product->B;
841: PetscCall(MatSeqAIJKokkosSyncDevice(A));
842: PetscCall(MatSeqAIJKokkosSyncDevice(B));
843: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
847: PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
849: csrmatA = akok->csrmat;
850: csrmatB = bkok->csrmat;
852: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853: if (transA) {
854: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855: transA = false;
856: }
858: if (transB) {
859: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860: transB = false;
861: }
862: PetscCall(PetscLogGpuTimeBegin());
863: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865: auto spgemmHandle = pdata->kh.get_spgemm_handle();
866: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867: #endif
869: PetscCall(PetscLogGpuTimeEnd());
870: PetscCall(MatSeqAIJKokkosModifyDevice(C));
871: /* shorter version of MatAssemblyEnd_SeqAIJ */
872: c = (Mat_SeqAIJ *)C->data;
873: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876: c->reallocs = 0;
877: C->info.mallocs = 0;
878: C->info.nz_unneeded = 0;
879: C->assembled = C->was_assembled = PETSC_TRUE;
880: C->num_ass++;
881: PetscFunctionReturn(PETSC_SUCCESS);
882: }
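/* The reusesym logic above corresponds to this user-level calling sequence (a sketch):

     Mat C;
     PetscCall(MatMatMult(A, B, MAT_INITIAL_MATRIX, PETSC_DEFAULT, &C)); // symbolic already ran numeric; reusesym set
     // the numeric phase of the call above was skipped thanks to reusesym, ...
     PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // ... but this call recomputes C's values
*/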
884: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885: {
886: Mat_Product *product = C->product;
887: MatProductType ptype;
888: Mat A, B;
889: bool transA, transB;
890: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
891: MatProductData_SeqAIJKokkos *pdata;
892: MPI_Comm comm;
893: KokkosCsrMatrix csrmatA, csrmatB, csrmatC;
895: PetscFunctionBegin;
896: MatCheckProduct(C, 1);
897: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898: PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899: A = product->A;
900: B = product->B;
901: PetscCall(MatSeqAIJKokkosSyncDevice(A));
902: PetscCall(MatSeqAIJKokkosSyncDevice(B));
903: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905: csrmatA = akok->csrmat;
906: csrmatB = bkok->csrmat;
908: ptype = product->type;
909: // Take advantage of the symmetry if true
910: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911: ptype = MATPRODUCT_AB;
912: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913: }
914: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915: ptype = MATPRODUCT_AB;
916: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917: }
919: switch (ptype) {
920: case MATPRODUCT_AB:
921: transA = false;
922: transB = false;
923: PetscCall(MatSetBlockSizesFromMats(C, A, B));
924: break;
925: case MATPRODUCT_AtB:
926: transA = true;
927: transB = false;
928: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930: break;
931: case MATPRODUCT_ABt:
932: transA = false;
933: transB = true;
934: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936: break;
937: default:
938: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939: }
940: PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941: pdata->reusesym = product->api_user;
943: /* TODO: add command line options to select spgemm algorithms */
944: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
946: /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950: #endif
951: #endif
952: PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
954: PetscCall(PetscLogGpuTimeBegin());
955: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956: if (transA) {
957: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958: transA = false;
959: }
961: if (transB) {
962: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963: transB = false;
964: }
966: PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968:      So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
969:      calling new Mat_SeqAIJKokkos().
970:      TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
971: */
972: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975: auto spgemmHandle = pdata->kh.get_spgemm_handle();
976: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
977: #endif
978: PetscCall(PetscLogGpuTimeEnd());
980: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983: PetscFunctionReturn(PETSC_SUCCESS);
984: }
986: /* handles sparse matrix-matrix ops */
987: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988: {
989: Mat_Product *product = mat->product;
990: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
992: PetscFunctionBegin;
993: MatCheckProduct(mat, 1);
994: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996: if (Biskok && Ciskok) {
997: switch (product->type) {
998: case MATPRODUCT_AB:
999: case MATPRODUCT_AtB:
1000: case MATPRODUCT_ABt:
1001: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002: break;
1003: case MATPRODUCT_PtAP:
1004: case MATPRODUCT_RARt:
1005: case MATPRODUCT_ABC:
1006: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007: break;
1008: default:
1009: break;
1010: }
1011: } else { /* fallback for AIJ */
1012: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013: }
1014: PetscFunctionReturn(PETSC_SUCCESS);
1015: }
1017: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018: {
1019: Mat_SeqAIJKokkos *aijkok;
1021: PetscFunctionBegin;
1022: PetscCall(PetscLogGpuTimeBegin());
1023: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025: KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028: PetscCall(PetscLogGpuTimeEnd());
1029: PetscFunctionReturn(PETSC_SUCCESS);
1030: }
1032: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034: {
1035: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1037: PetscFunctionBegin;
1038: if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039: PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1041: PetscCall(PetscLogGpuTimeBegin());
1042: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044: const auto &Aa = aijkok->a_dual.view_device();
1045: const auto &Adiag = aijkok->diag_dual.view_device();
1046: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048: PetscCall(PetscLogGpuFlops(n));
1049: PetscCall(PetscLogGpuTimeEnd());
1050: } else { // need reassembly, very slow!
1051: PetscCall(MatShift_Basic(A, a));
1052: }
1053: PetscFunctionReturn(PETSC_SUCCESS);
1054: }
1056: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057: {
1058: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1060: PetscFunctionBegin;
1061: if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062: ConstPetscScalarKokkosView dv;
1063: PetscInt n, nv;
1065: PetscCall(PetscLogGpuTimeBegin());
1066: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067: PetscCall(VecGetKokkosView(D, &dv));
1068: PetscCall(VecGetLocalSize(D, &nv));
1069: n = PetscMin(Y->rmap->n, Y->cmap->n);
1070: PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1072: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073: const auto &Aa = aijkok->a_dual.view_device();
1074: const auto &Adiag = aijkok->diag_dual.view_device();
1075: PetscCallCXX(Kokkos::parallel_for(
1076: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077: if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078: else Aa(Adiag(i)) += dv(i);
1079: }));
1080: PetscCall(VecRestoreKokkosView(D, &dv));
1081: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082: PetscCall(PetscLogGpuFlops(n));
1083: PetscCall(PetscLogGpuTimeEnd());
1084: } else { // need reassembly, very slow!
1085: PetscCall(MatDiagonalSet_Default(Y, D, is));
1086: }
1087: PetscFunctionReturn(PETSC_SUCCESS);
1088: }
1090: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091: {
1092: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093: PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094: ConstPetscScalarKokkosView lv, rv;
1096: PetscFunctionBegin;
1097: PetscCall(PetscLogGpuTimeBegin());
1098: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100: const auto &Aa = aijkok->a_dual.view_device();
1101: const auto &Ai = aijkok->i_dual.view_device();
1102: const auto &Aj = aijkok->j_dual.view_device();
1103: if (ll) {
1104: PetscCall(VecGetLocalSize(ll, &m));
1105: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106: PetscCall(VecGetKokkosView(ll, &lv));
1107: PetscCallCXX(Kokkos::parallel_for( // for each row
1108: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109: PetscInt i = t.league_rank(); // row i
1110: PetscInt len = Ai(i + 1) - Ai(i);
1111: // scale entries on the row
1112: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113: }));
1114: PetscCall(VecRestoreKokkosView(ll, &lv));
1115: PetscCall(PetscLogGpuFlops(nz));
1116: }
1117: if (rr) {
1118: PetscCall(VecGetLocalSize(rr, &n));
1119: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120: PetscCall(VecGetKokkosView(rr, &rv));
1121: PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123:     PetscCall(VecRestoreKokkosView(rr, &rv));
1124: PetscCall(PetscLogGpuFlops(nz));
1125: }
1126: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127: PetscCall(PetscLogGpuTimeEnd());
1128: PetscFunctionReturn(PETSC_SUCCESS);
1129: }
1131: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132: {
1133: Mat_SeqAIJKokkos *aijkok;
1135: PetscFunctionBegin;
1136: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137: if (aijkok) { /* Only zero the device if data is already there */
1138: KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140: } else { /* Might be preallocated but not assembled */
1141: PetscCall(MatZeroEntries_SeqAIJ(A));
1142: }
1143: PetscFunctionReturn(PETSC_SUCCESS);
1144: }
1146: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147: {
1148: Mat_SeqAIJKokkos *aijkok;
1149: PetscInt n;
1150: PetscScalarKokkosView xv;
1152: PetscFunctionBegin;
1153: PetscCall(VecGetLocalSize(x, &n));
1154: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1157: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1160: const auto &Aa = aijkok->a_dual.view_device();
1161: const auto &Ai = aijkok->i_dual.view_device();
1162: const auto &Adiag = aijkok->diag_dual.view_device();
1164: PetscCall(VecGetKokkosViewWrite(x, &xv));
1165: Kokkos::parallel_for(
1166: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168: else xv(i) = 0;
1169: });
1170: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171: PetscFunctionReturn(PETSC_SUCCESS);
1172: }
1174: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176: {
1177: Mat_SeqAIJKokkos *aijkok;
1179: PetscFunctionBegin;
1181: PetscAssertPointer(kv, 2);
1182: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185: *kv = aijkok->a_dual.view_device();
1186: PetscFunctionReturn(PETSC_SUCCESS);
1187: }
1189: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190: {
1191: PetscFunctionBegin;
1193: PetscAssertPointer(kv, 2);
1194: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199: {
1200: Mat_SeqAIJKokkos *aijkok;
1202: PetscFunctionBegin;
1204: PetscAssertPointer(kv, 2);
1205: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208: *kv = aijkok->a_dual.view_device();
1209: PetscFunctionReturn(PETSC_SUCCESS);
1210: }
1212: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213: {
1214: PetscFunctionBegin;
1216: PetscAssertPointer(kv, 2);
1217: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219: PetscFunctionReturn(PETSC_SUCCESS);
1220: }
1222: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223: {
1224: Mat_SeqAIJKokkos *aijkok;
1226: PetscFunctionBegin;
1228: PetscAssertPointer(kv, 2);
1229: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231: *kv = aijkok->a_dual.view_device();
1232: PetscFunctionReturn(PETSC_SUCCESS);
1233: }
1235: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236: {
1237: PetscFunctionBegin;
1239: PetscAssertPointer(kv, 2);
1240: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242: PetscFunctionReturn(PETSC_SUCCESS);
1243: }
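/* A usage sketch for the accessors above (illustrative; scales all stored values by 2 on device):

     MatScalarKokkosView av;
     PetscCall(MatSeqAIJGetKokkosView(A, &av)); // device values, synced if needed
     PetscCallCXX(KokkosBlas::scal(PetscGetKokkosExecutionSpace(), av, 2.0, av));
     PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); // marks the device side as modified
*/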
1245: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246: {
1247: Mat_SeqAIJKokkos *akok;
1249: PetscFunctionBegin;
1250: auto exec = PetscGetKokkosExecutionSpace();
1251: // Create host copies of the input aij
1252: auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
1253: auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
1254: // Don't copy the vals to the host now
1255: auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);
1257: MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
1258: // Note we have modified device data so it will copy lazily
1259: a_dual.modify_device();
1260: MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
1261: MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);
1263: PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
1264: PetscCall(MatCreate(comm, A));
1265: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1266: PetscFunctionReturn(PETSC_SUCCESS);
1267: }
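/* A construction sketch for the routine above (illustrative; assumes i_d/j_d/a_d already hold a
   valid CSR of an m x n matrix on device):

     Kokkos::View<PetscInt *>    i_d("i", m + 1), j_d("j", nnz);
     Kokkos::View<PetscScalar *> a_d("a", nnz);
     // ... fill i_d, j_d, a_d in device kernels ...
     Mat A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, m, n, i_d, j_d, a_d, &A));
*/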
1269: /* Computes Y += alpha X */
1270: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1271: {
1272: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1273: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1274: ConstMatScalarKokkosView Xa;
1275: MatScalarKokkosView Ya;
1276: auto exec = PetscGetKokkosExecutionSpace();
1278: PetscFunctionBegin;
1279: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1280: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1281: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1282: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1283: PetscCall(PetscLogGpuTimeBegin());
1285: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1286: PetscBool e;
1287: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1288: if (e) {
1289: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1290: if (e) pattern = SAME_NONZERO_PATTERN;
1291: }
1292: }
1294:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1295:      cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1296:      If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1297:      KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, while MatAXPY does not.
1298: */
1299: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1300: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1301: Xa = xkok->a_dual.view_device();
1302: Ya = ykok->a_dual.view_device();
1304: if (pattern == SAME_NONZERO_PATTERN) {
1305: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1306: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1307: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1308: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1309: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1311: Kokkos::parallel_for(
1312: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1313: PetscInt i = t.league_rank(); // row i
1314: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1315: // Only one thread works in a team
1316: PetscInt p, q = Yi(i);
1317: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1318:              while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++; // find the matching nonzero on row i of Y, checking bounds before reading Yj
1319:              if (q < Yi(i + 1) && Xj(p) == Yj(q)) { // Found it
1320: Ya(q) += alpha * Xa(p);
1321: q++;
1322: } else {
1323: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1324: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1325: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1326: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1327: #else
1328: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1329: #endif
1330: }
1331: }
1332: });
1333: });
1334: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1335: } else { // different nonzero patterns
1336: Mat Z;
1337: KokkosCsrMatrix zcsr;
1338: KernelHandle kh;
1339: kh.create_spadd_handle(true); // X, Y are sorted
1340: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1341: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1342: zkok = new Mat_SeqAIJKokkos(zcsr);
1343: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1344: PetscCall(MatHeaderReplace(Y, &Z));
1345: kh.destroy_spadd_handle();
1346: }
1347: PetscCall(PetscLogGpuTimeEnd());
1348: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1349: PetscFunctionReturn(PETSC_SUCCESS);
1350: }
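/* Usage sketch (editor's illustration, not part of this file): with assembled MATSEQAIJKOKKOS matrices X and Y,
     PetscCall(MatAXPY(Y, 2.0, X, SUBSET_NONZERO_PATTERN)); // Y += 2*X entirely on device via the kernel above
   SAME_NONZERO_PATTERN dispatches to KokkosBlas::axpy(); DIFFERENT_NONZERO_PATTERN rebuilds Y through KokkosSparse::spadd(). */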
1352: struct MatCOOStruct_SeqAIJKokkos {
1353: PetscCount n;
1354: PetscCount Atot;
1355: PetscInt nz;
1356: PetscCountKokkosView jmap;
1357: PetscCountKokkosView perm;
1359: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1360: {
1361: nz = coo_h->nz;
1362: n = coo_h->n;
1363: Atot = coo_h->Atot;
1364: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1365: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1366: }
1367: };
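/* Worked example (editor's illustration): coo_i = {0, 1, 0}, coo_j = {0, 1, 0} (n = 3 inputs with one duplicate)
   yields nz = 2 unique nonzeros: (0,0) and (1,1). Then jmap = {0, 2, 3} and perm = {0, 2, 1} (up to the ordering of
   duplicates), so nonzero i accumulates kv(perm(k)) for k in [jmap(i), jmap(i+1)): entry (0,0) gets v[0] + v[2]. */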
1369: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1370: {
1371: PetscFunctionBegin;
1372: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1373: PetscFunctionReturn(PETSC_SUCCESS);
1374: }
1376: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1377: {
1378: Mat_SeqAIJKokkos *akok;
1379: Mat_SeqAIJ *aseq;
1380: PetscContainer container_h;
1381: MatCOOStruct_SeqAIJ *coo_h;
1382: MatCOOStruct_SeqAIJKokkos *coo_d;
1384: PetscFunctionBegin;
1385: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1386: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1387: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1388: delete akok;
1389: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1390: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1392: // Copy the COO struct to device
1393: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1394: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1395: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1397: // Put the COO struct in a container and then attach that to the matrix
1398: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1399: PetscFunctionReturn(PETSC_SUCCESS);
1400: }
1402: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1403: {
1404: MatScalarKokkosView Aa;
1405: ConstMatScalarKokkosView kv;
1406: PetscMemType memtype;
1407: PetscContainer container;
1408: MatCOOStruct_SeqAIJKokkos *coo;
1410: PetscFunctionBegin;
1411: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1412: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1414: const auto &n = coo->n;
1415: const auto &Annz = coo->nz;
1416: const auto &jmap = coo->jmap;
1417: const auto &perm = coo->perm;
1419: PetscCall(PetscGetMemType(v, &memtype));
1420:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we may need to copy it to device */
1421: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1422: } else {
1423: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1424: }
1426: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1427: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1429: PetscCall(PetscLogGpuTimeBegin());
1430: Kokkos::parallel_for(
1431: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1432: PetscScalar sum = 0.0;
1433: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1434: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1435: });
1436: PetscCall(PetscLogGpuTimeEnd());
1438: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1439: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1440: PetscFunctionReturn(PETSC_SUCCESS);
1441: }
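/* Usage sketch (hypothetical driver code, not part of this file):
     PetscInt    coo_i[] = {0, 1, 0}, coo_j[] = {0, 1, 0}; // note the duplicated (0,0)
     PetscScalar v[]     = {1.0, 2.0, 3.0};
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds the host and device COO structs above
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          // A(0,0) = 1.0 + 3.0 = 4.0, A(1,1) = 2.0
   v[] may live in host or device memory; PetscGetMemType() decides whether a device copy is needed. */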
1443: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1444: {
1445: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1447: PetscFunctionBegin;
1448: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1449: A->boundtocpu = PETSC_FALSE;
1451: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1452: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1453: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1454: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1455: A->ops->scale = MatScale_SeqAIJKokkos;
1456: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1457: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1458: A->ops->mult = MatMult_SeqAIJKokkos;
1459: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1460: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1461: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1462: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1463: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1464: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1465: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1466: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1467: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1468: A->ops->shift = MatShift_SeqAIJKokkos;
1469: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1470: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1471: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1472: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1473: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1474: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1475: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1476: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1477: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1479: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1480: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1481: #if defined(PETSC_HAVE_HYPRE)
1482: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1483: #endif
1484: PetscFunctionReturn(PETSC_SUCCESS);
1485: }
1487: /*
1488:    Extract the (prescribed) diagonal blocks of the matrix and then invert them
1490: Input Parameters:
1491: + A - the MATSEQAIJKOKKOS matrix
1492: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1493: .  bs2 - squared block sizes in 'csr' format (prefix sums of the squared sizes), i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1494: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1495: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1497: Output Parameter:
1498: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
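
   Example (editor's illustration): for two diagonal blocks of sizes 2 and 3 in a 5-row matrix,
   bs = {0,2,5}, bs2 = {0,4,13} (4 = 2^2, 13 = 4 + 3^2) and blkMap = {0,0,1,1,1}; the inverted 2x2 block
   occupies diagVal[0..3] and the inverted 3x3 block diagVal[4..12], each in column-major order.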
1499: */
1500: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1501: {
1502: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1503: PetscInt nblocks = bs.extent(0) - 1;
1505: PetscFunctionBegin;
1506: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1508: // Pull out the diagonal blocks of the matrix and then invert the blocks
1509: auto Aa = akok->a_dual.view_device();
1510: auto Ai = akok->i_dual.view_device();
1511: auto Aj = akok->j_dual.view_device();
1512: auto Adiag = akok->diag_dual.view_device();
1513: // TODO: how to tune the team size?
1514: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1515: auto ts = Kokkos::AUTO();
1516: #else
1517: auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1518: #endif
1519: PetscCallCXX(Kokkos::parallel_for(
1520: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1521: const PetscInt bid = teamMember.league_rank(); // block id
1522: const PetscInt rstart = bs(bid); // this block starts from this row
1523: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1524: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1525: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1527: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1528: PetscInt i = rstart + r; // i-th row in A
1530: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1531: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1533:           for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1534: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1535: B(r, c) = 0.0;
1536: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1537: B(r, c) = Aa(first + c);
1538: } else { // this entry does not show up in the CSR
1539: B(r, c) = 0.0;
1540: }
1541: }
1542: } else { // rare case that the diagonal does not exist
1543: const PetscInt begin = Ai(i);
1544: const PetscInt end = Ai(i + 1);
1545: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1546: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1547: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1548: else if (Aj(j) >= rstart + m) break;
1549: }
1550: }
1551: });
1553: // LU-decompose B (w/o pivoting) and then invert B
1554: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1555: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1556: }));
1557: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1558: PetscFunctionReturn(PETSC_SUCCESS);
1559: }
1561: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1562: {
1563: Mat_SeqAIJ *aseq;
1564: PetscInt i, m, n;
1565: auto exec = PetscGetKokkosExecutionSpace();
1567: PetscFunctionBegin;
1568: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1570: m = akok->nrows();
1571: n = akok->ncols();
1572: PetscCall(MatSetSizes(A, m, n, m, n));
1573: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1575: /* Set up data structures of A as a MATSEQAIJ */
1576: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1577: aseq = (Mat_SeqAIJ *)A->data;
1579: PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1580: PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1582: aseq->i = akok->i_host_data();
1583: aseq->j = akok->j_host_data();
1584: aseq->a = akok->a_host_data();
1585:   aseq->nonew   = -1; /* inserting a new value that generates a new nonzero location is an error */
1586: aseq->free_a = PETSC_FALSE;
1587: aseq->free_ij = PETSC_FALSE;
1588: aseq->nz = akok->nnz();
1589: aseq->maxnz = aseq->nz;
1591: PetscCall(PetscMalloc1(m, &aseq->imax));
1592: PetscCall(PetscMalloc1(m, &aseq->ilen));
1593: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1595: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1596: akok->nonzerostate = A->nonzerostate;
1597: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1598: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1599: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1600: PetscFunctionReturn(PETSC_SUCCESS);
1601: }
1603: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1604: {
1605: PetscFunctionBegin;
1606: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1607: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1608: PetscFunctionReturn(PETSC_SUCCESS);
1609: }
1611: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1612: {
1613: Mat_SeqAIJKokkos *akok;
1615: PetscFunctionBegin;
1616: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1617: PetscCall(MatCreate(comm, A));
1618: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619: PetscFunctionReturn(PETSC_SUCCESS);
1620: }
1622: /* Create a MATSEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1624:    Note we have names like MatSeqAIJSetPreallocationCSR, so we use capitalized CSR
1625: */
1626: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1627: {
1628: PetscFunctionBegin;
1629: PetscCall(MatCreate(comm, A));
1630: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1631: PetscFunctionReturn(PETSC_SUCCESS);
1632: }
1634: /*@C
1635: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1636:   (the default sequential PETSc format). This matrix will ultimately be handled by
1637: Kokkos for calculations.
1639: Collective
1641: Input Parameters:
1642: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1643: . m - number of rows
1644: . n - number of columns
1645: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1646: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1648: Output Parameter:
1649: . A - the matrix
1651: Level: intermediate
1653: Notes:
1654: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1655:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1656: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1658: The AIJ format, also called
1659: compressed row storage, is fully compatible with standard Fortran
1660: storage. That is, the stored row and column indices can begin at
1661: either one (as in Fortran) or zero.
1663: Specify the preallocated storage with either `nz` or `nnz` (not both).
1664: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1665: allocation.
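
   Example usage (a minimal sketch):
.vb
   Mat A;
   PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 10, 10, 3, NULL, &A)); // preallocate up to 3 nonzeros per row
   // ... set entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd(), then use A on device ...
   PetscCall(MatDestroy(&A));
.ve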
1667: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1668: @*/
1669: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1670: {
1671: PetscFunctionBegin;
1672: PetscCall(PetscKokkosInitializeCheck());
1673: PetscCall(MatCreate(comm, A));
1674: PetscCall(MatSetSizes(*A, m, n, m, n));
1675: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1676: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1677: PetscFunctionReturn(PETSC_SUCCESS);
1678: }
1680: // After matrix numeric factorization, there are still steps to do before a triangular solve can be called.
1681: // For example, for a transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solves (KokkosKernels does not, while cusparse does).
1682: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1683: // Similarly, in KokkosKernels sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
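// Typical sequence (derived from the code below): a numeric factorization resets sptrsv_symbolic_completed and
// transpose_updated to PETSC_FALSE; the first subsequent MatSolve() then runs sptrsv_symbolic() here once, and
// every solve afterwards only calls sptrsv_solve().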
1684: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1685: {
1686: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1687:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1688:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky
1690: PetscFunctionBegin;
1691: if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1692: if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1693: if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1694: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1695: }
1696: PetscFunctionReturn(PETSC_SUCCESS);
1697: }
1699: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1700: {
1701: const PetscInt n = A->rmap->n;
1702: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1703:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1704:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky
1706: PetscFunctionBegin;
1707: if (!factors->transpose_updated) {
1708: if (has_upper) {
1709:       if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not done yet
1710: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1711: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1712: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1713: }
1715: if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1716: if (!factors->U) {
1717: Mat_SeqAIJ *seq;
1719: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1720: PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1722: seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1723: factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1724: factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1725: factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1726: } else {
1727:           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Ut's data is aliased with {i, j, a}Ut_h
1728: }
1729: // Copy Ut from host to device
1730: PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1731: PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1732: PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1733: } else { // If U was computed on device, we also compute the transpose there
1734: // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1735: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1736: factors->jU_d, factors->aU_d,
1737: factors->iUt_d, factors->jUt_d,
1738: factors->aUt_d));
1739: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1740: }
1741: PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1742: }
1744: // do the same for L with LU
1745: if (has_lower) {
1746:       if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not done yet
1747: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1748: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1749: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1750: }
1752: if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1753: if (!factors->L) {
1754: Mat_SeqAIJ *seq;
1756: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1757: PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1759: seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1760: factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1761: factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1762: factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1763: } else {
1764:           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Lt's data is aliased with {i, j, a}Lt_h
1765: }
1766: // Copy Lt from host to device
1767: PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1768: PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1769: PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1770: } else { // If L was computed on device, we also compute the transpose there
1771: // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1772: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1773: factors->jL_d, factors->aL_d,
1774: factors->iLt_d, factors->jLt_d,
1775: factors->aLt_d));
1776: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1777: }
1778: PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1779: }
1781: factors->transpose_updated = PETSC_TRUE;
1782: }
1783: PetscFunctionReturn(PETSC_SUCCESS);
1784: }
1786: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and column) permutation matrix on A.
1787: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
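// Steps performed below (a sketch): 1) B = R b; 2) solve U^T Y = B; 3) Y = D Y (D already holds the inverted
// diagonal); 4) solve U X = Y; 5) x = R^T X. When R is identity, steps 1 and 5 are skipped and views alias b and x.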
1788: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1789: {
1790: auto exec = PetscGetKokkosExecutionSpace();
1791: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1792: PetscInt m = A->rmap->n;
1793: PetscScalarKokkosView D = factors->D_d;
1794: PetscScalarKokkosView X, Y, B; // alias
1795: ConstPetscScalarKokkosView b;
1796: PetscScalarKokkosView x;
1797: PetscIntKokkosView &rowperm = factors->rowperm;
1798: PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1800: PetscFunctionBegin;
1801: PetscCall(PetscLogGpuTimeBegin());
1802: PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T
1803: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1804: PetscCall(VecGetKokkosView(bb, &b));
1805: PetscCall(VecGetKokkosViewWrite(xx, &x));
1807: // Solve U^T Y = B
1808:   if (identity) { // No reordering; b can be used directly
1809: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1810: Y = factors->workVector;
1811:   } else { // Reorder b with the row permutation
1812: B = factors->workVector;
1813: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1814: Y = x;
1815: }
1816: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1818: // Solve diag(D) Y' = Y.
1819: // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1820: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1822: // Solve UX = Y
1823: if (identity) {
1824: X = x;
1825: } else {
1826: X = factors->workVector; // B is not needed anymore
1827: }
1828: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1830: // Reorder X with the inverse column (row) permutation
1831: if (!identity) {
1832: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1833: }
1835: PetscCall(VecRestoreKokkosView(bb, &b));
1836: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1837: PetscCall(PetscLogGpuTimeEnd());
1838: PetscFunctionReturn(PETSC_SUCCESS);
1839: }
1841: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1842: // R and C are represented by rowperm and colperm in factors.
1843: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
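// Steps performed below (a sketch): 1) B = R b; 2) solve L Y = B; 3) solve U X = Y; 4) x = C X.
// When R (or C) is identity, the corresponding permutation step is skipped and views alias b (or x).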
1844: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1845: {
1846: auto exec = PetscGetKokkosExecutionSpace();
1847: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1848: PetscInt m = A->rmap->n;
1849: PetscScalarKokkosView X, Y, B; // alias
1850: ConstPetscScalarKokkosView b;
1851: PetscScalarKokkosView x;
1852: PetscIntKokkosView &rowperm = factors->rowperm;
1853: PetscIntKokkosView &colperm = factors->colperm;
1854: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1855: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1857: PetscFunctionBegin;
1858: PetscCall(PetscLogGpuTimeBegin());
1859: PetscCall(MatSeqAIJKokkosSolveCheck(A));
1860: PetscCall(VecGetKokkosView(bb, &b));
1861: PetscCall(VecGetKokkosViewWrite(xx, &x));
1863: // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
1864: if (row_identity) {
1865: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1866: Y = factors->workVector;
1867: } else {
1868: B = factors->workVector;
1869: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1870: Y = x;
1871: }
1872: PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1874: // Solve U C^- x = Y
1875: if (col_identity) {
1876: X = x;
1877: } else {
1878: X = factors->workVector;
1879: }
1880: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1882: // x = C X; Reorder X with the inverse col permutation
1883: if (!col_identity) {
1884: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1885: }
1887: PetscCall(VecRestoreKokkosView(bb, &b));
1888: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1889: PetscCall(PetscLogGpuTimeEnd());
1890: PetscFunctionReturn(PETSC_SUCCESS);
1891: }
1893: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1894: // R and C are represented by rowperm and colperm in factors.
1895: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1896: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
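// Steps performed below (a sketch): 1) B = C^T b; 2) solve U^T Y = B; 3) solve L^T X = Y; 4) x = R^T X.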
1897: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1898: {
1899: auto exec = PetscGetKokkosExecutionSpace();
1900: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1901: PetscInt m = A->rmap->n;
1902: PetscScalarKokkosView X, Y, B; // alias
1903: ConstPetscScalarKokkosView b;
1904: PetscScalarKokkosView x;
1905: PetscIntKokkosView &rowperm = factors->rowperm;
1906: PetscIntKokkosView &colperm = factors->colperm;
1907: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1908: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1910: PetscFunctionBegin;
1911: PetscCall(PetscLogGpuTimeBegin());
1912: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1913: PetscCall(VecGetKokkosView(bb, &b));
1914: PetscCall(VecGetKokkosViewWrite(xx, &x));
1916: // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
1917:   if (col_identity) { // No reordering; b can be used directly
1918: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1919: Y = factors->workVector;
1920:   } else { // Reorder b with the col permutation
1921: B = factors->workVector;
1922: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1923: Y = x;
1924: }
1925: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1927: // Solve L^T X = Y
1928: if (row_identity) {
1929: X = x;
1930: } else {
1931: X = factors->workVector;
1932: }
1933: PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1935: // x = R^- X = R^T X; Reorder X with the inverse row permutation
1936: if (!row_identity) {
1937: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1938: }
1940: PetscCall(VecRestoreKokkosView(bb, &b));
1941: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1942: PetscCall(PetscLogGpuTimeEnd());
1943: PetscFunctionReturn(PETSC_SUCCESS);
1944: }
1946: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1947: {
1948: PetscFunctionBegin;
1949: PetscCall(MatSeqAIJKokkosSyncHost(A));
1950: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1952: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1953: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1954: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1955: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1956: const MatScalar *Ba = b->a;
1957: PetscInt m = B->rmap->n, n = B->cmap->n;
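    // Storage note (derived from the copying code below): in the SeqAIJ factor B, b->i/b->j hold only the strictly
    // lower triangular part (L without its unit diagonal), while U (diagonal included) is addressed through b->diag:
    // row i of U has Bdiag[i] - Bdiag[i+1] entries, with the diagonal at Bdiag[i] and the off-diagonals starting at
    // Bdiag[i+1] + 1. Bdiag decreases with i, so Bdiag[0] - Bdiag[m] equals nnz(U).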
1959:     if (factors->iL_h.extent(0) == 0) { // First numeric factorization: allocate memory and copy the L, U structure
1960:       // Allocate host views for the structure of L and U
1961: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1962: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1963: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1964: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1965: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1966: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1968: PetscInt *Li = factors->iL_h.data();
1969: PetscInt *Lj = factors->jL_h.data();
1970: PetscInt *Ui = factors->iU_h.data();
1971: PetscInt *Uj = factors->jU_h.data();
1973: Li[0] = Ui[0] = 0;
1974: for (PetscInt i = 0; i < m; i++) {
1975: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1976: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1978:         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1979:         Lj[Li[i] + llen] = i; // diagonal entry of L
1981:         Uj[Ui[i]] = i; // diagonal entry of U
1982:         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
1984: Li[i + 1] = Li[i] + llen + 1;
1985: Ui[i + 1] = Ui[i] + ulen;
1986: }
1988: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1989: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1990: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1991: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1992: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1993: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1995: // Copy row/col permutation to device
1996: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1997: PetscBool row_identity;
1998: PetscCall(ISIdentity(rowperm, &row_identity));
1999: if (!row_identity) {
2000: const PetscInt *ip;
2002: PetscCall(ISGetIndices(rowperm, &ip));
2003: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
2004: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2005: PetscCall(ISRestoreIndices(rowperm, &ip));
2006: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2007: }
2009: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
2010: PetscBool col_identity;
2011: PetscCall(ISIdentity(colperm, &col_identity));
2012: if (!col_identity) {
2013: const PetscInt *ip;
2015: PetscCall(ISGetIndices(colperm, &ip));
2016: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2017: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2018: PetscCall(ISRestoreIndices(colperm, &ip));
2019: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2020: }
2022: /* Create sptrsv handles for L, U and their transpose */
2023: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2024: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2025: #else
2026: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2027: #endif
2028: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2029: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2030: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2031: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2032: }
2034: // Copy the value
2035: for (PetscInt i = 0; i < m; i++) {
2036: PetscInt llen = Bi[i + 1] - Bi[i];
2037: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2038: const PetscInt *Li = factors->iL_h.data();
2039: const PetscInt *Ui = factors->iU_h.data();
2041: PetscScalar *La = factors->aL_h.data();
2042: PetscScalar *Ua = factors->aU_h.data();
2044:       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2045:       La[Li[i] + llen] = 1.0; // diagonal entry
2047:       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry
2048:       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2049: }
2051: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2052: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2053: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2054: factors->transpose_updated = PETSC_FALSE;
2055: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2057: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2058: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2059: }
2061: B->ops->matsolve = NULL;
2062: B->ops->matsolvetranspose = NULL;
2063: PetscFunctionReturn(PETSC_SUCCESS);
2064: }
2066: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2067: {
2068: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2069: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2070: PetscInt fill_lev = info->levels;
2072: PetscFunctionBegin;
2073: PetscCall(PetscLogGpuTimeBegin());
2074: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2075: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2077: auto a_d = aijkok->a_dual.view_device();
2078: auto i_d = aijkok->i_dual.view_device();
2079: auto j_d = aijkok->j_dual.view_device();
2081: PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2083: B->assembled = PETSC_TRUE;
2084: B->preallocated = PETSC_TRUE;
2085: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2086: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2087: B->ops->matsolve = NULL;
2088: B->ops->matsolvetranspose = NULL;
2090: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2091: factors->transpose_updated = PETSC_FALSE;
2092: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2093: /* TODO: log flops, but how to know that? */
2094: PetscCall(PetscLogGpuTimeEnd());
2095: PetscFunctionReturn(PETSC_SUCCESS);
2096: }
2098: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2099: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2100: {
2101: Mat_SeqAIJKokkos *aijkok;
2102: Mat_SeqAIJ *b;
2103: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2104: PetscInt fill_lev = info->levels;
2105: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2106: PetscInt n = A->rmap->n;
2108: PetscFunctionBegin;
2109: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2110: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2112: /* Create a spiluk handle and then do symbolic factorization */
2113: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2114: factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2116: auto spiluk_handle = factors->kh.get_spiluk_handle();
2118: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2119: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2120: Kokkos::realloc(factors->iU_d, n + 1);
2121: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2123: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2124: auto i_d = aijkok->i_dual.view_device();
2125: auto j_d = aijkok->j_dual.view_device();
2126: PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2127: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2129: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2130: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2131: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2132: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2134: /* TODO: add options to select sptrsv algorithms */
2135: /* Create sptrsv handles for L, U and their transpose */
2136: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2137: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2138: #else
2139: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2140: #endif
2142: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2143: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2144: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2145: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2147: /* Fill fields of the factor matrix B */
2148: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2149: b = (Mat_SeqAIJ *)B->data;
2150: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2151: B->info.fill_ratio_given = info->fill;
2152: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2154: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2155: PetscFunctionReturn(PETSC_SUCCESS);
2156: }
2158: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2159: {
2160: PetscFunctionBegin;
2161: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2162: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2163: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2164: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2165: PetscFunctionReturn(PETSC_SUCCESS);
2166: }
2168: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2169: {
2170: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2172: PetscFunctionBegin;
2173: if (!info->factoronhost) {
2174: PetscCall(ISIdentity(isrow, &row_identity));
2175: PetscCall(ISIdentity(iscol, &col_identity));
2176: }
2178: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2179: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2181: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2182: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2183: } else {
2184: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2185: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2186: }
2187: PetscFunctionReturn(PETSC_SUCCESS);
2188: }
2190: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2191: {
2192: PetscFunctionBegin;
2193: PetscCall(MatSeqAIJKokkosSyncHost(A));
2194: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2196: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2197: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2198: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
2199: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2200: const MatScalar *Ba = b->a;
2201: PetscInt m = B->rmap->n;
2203: if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2204: // Allocate memory and copy the structure
2205: factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2206: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2207: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2208: factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
2209: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2210: factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2212:       // Build jU_h from Bj; Bj's rows exclude the diagonal (hence are 'skewed' relative to Bi), so prepend each row's diagonal index here
2213: PetscInt *Uj = factors->jU_h.data();
2214: for (PetscInt i = 0; i < m; i++) {
2215: PetscInt ulen = Bi[i + 1] - Bi[i];
2216: Uj[Bi[i]] = i; // diagonal entry
2217: PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2218: }
2220: // Copy iU, jU to device
2221: PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2222: PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2224: // Copy row/col permutation to device
2225: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
2226: PetscBool row_identity;
2227: PetscCall(ISIdentity(rowperm, &row_identity));
2228: if (!row_identity) {
2229: const PetscInt *ip;
2231: PetscCall(ISGetIndices(rowperm, &ip));
2232: PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2233: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2234: PetscCall(ISRestoreIndices(rowperm, &ip));
2235: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2236: }
2238: // Create sptrsv handles for U and U^T
2239: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2240: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2241: #else
2242: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2243: #endif
2244: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2245: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2246: }
2247:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to reset them
2248: B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
2249: B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2251: // Copy the value
2252: PetscScalar *Ua = factors->aU_h.data();
2253: PetscScalar *D = factors->D_h.data();
2254: for (PetscInt i = 0; i < m; i++) {
2255:       D[i] = Ba[Bdiag[i]]; // Ba[Bdiag[i]] actually stores the inverse of the diagonal
2256: Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2257: for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2258: }
2259: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2260: PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2262: factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2263: factors->transpose_updated = PETSC_FALSE;
2264: }
2266: B->ops->matsolve = NULL;
2267: B->ops->matsolvetranspose = NULL;
2268: PetscFunctionReturn(PETSC_SUCCESS);
2269: }
2271: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2272: {
2273: PetscFunctionBegin;
2274: if (info->solveonhost) {
2275: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2276: PetscCall(MatSetType(B, MATSEQSBAIJ));
2277: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2278: }
2280: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2282: if (!info->solveonhost) {
2283: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2284: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2285: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2286: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2287: }
2288: PetscFunctionReturn(PETSC_SUCCESS);
2289: }
2291: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2292: {
2293: PetscFunctionBegin;
2294: if (info->solveonhost) {
2295: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2296: PetscCall(MatSetType(B, MATSEQSBAIJ));
2297: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2298: }
2300: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2302: if (!info->solveonhost) {
2303: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2304: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2305: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2306: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2307: }
2308: PetscFunctionReturn(PETSC_SUCCESS);
2309: }
2311: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2312: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2313: {
2314: PetscFunctionBegin;
2315: *type = MATSOLVERKOKKOS;
2316: PetscFunctionReturn(PETSC_SUCCESS);
2317: }
2319: /*MC
2320:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing factorizations and triangular solves for sequential
2321:   matrices of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS` on a single GPU.
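
  A usage sketch (assuming the standard PETSc options database keys): select this solver at run time with
  `-pc_factor_mat_solver_type kokkos`, e.g., together with `-pc_type lu` and `-mat_type aijkokkos`, or
  programmatically with `PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS)`.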
2323: Level: beginner
2325: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2326: M*/
2327: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2328: {
2329: PetscInt n = A->rmap->n;
2330: MPI_Comm comm;
2332: PetscFunctionBegin;
2333: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2334: PetscCall(MatCreate(comm, B));
2335: PetscCall(MatSetSizes(*B, n, n, n, n));
2336: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2337: (*B)->factortype = ftype;
2338: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2339: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2340: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2342: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2343: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2344: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2345: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2346: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2347: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2348: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2349: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2350: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2351: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2352: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2353: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2355:   // The factorization can use the ordering provided to MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though we apply it on host
2356: (*B)->canuseordering = PETSC_TRUE;
2357: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2358: PetscFunctionReturn(PETSC_SUCCESS);
2359: }
2361: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2362: {
2363: PetscFunctionBegin;
2364: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2365: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2366: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2367: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2368: PetscFunctionReturn(PETSC_SUCCESS);
2369: }
2371: /* Utility to print out a KokkosCsrMatrix for debugging */
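/* Sample output (implied by the PetscPrintf formats below) for a 2 x 2 matrix with 3 nonzeros:
     2 x 2 SeqAIJKokkos, with 3 nonzeros
     0: 0(1.0), 1(2.0),
     1: 1(3.0),
*/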
2372: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2373: {
2374: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2375: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2376: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2377: const PetscInt *i = iv.data();
2378: const PetscInt *j = jv.data();
2379: const PetscScalar *a = av.data();
2380: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2382: PetscFunctionBegin;
2383: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2384: for (PetscInt k = 0; k < m; k++) {
2385: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2386: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2387: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2388: }
2389: PetscFunctionReturn(PETSC_SUCCESS);
2390: }