Actual source code: aijkok.kokkos.cxx

#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscmat_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/kokkosimpl.hpp>
#include <petscsys.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>

// To suppress compiler warnings:
// /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
// warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
// cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
// int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
#define DISABLE_CUSPARSE_DEPRECATED
#include <KokkosSparse_spmv.hpp>

#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
  #include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
  #include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
using KokkosSparse::spiluk_symbolic;
using KokkosSparse::spiluk_numeric;
using KokkosSparse::sptrsv_symbolic;
using KokkosSparse::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#else
using KokkosSparse::Experimental::spiluk_symbolic;
using KokkosSparse::Experimental::spiluk_numeric;
using KokkosSparse::Experimental::sptrsv_symbolic;
using KokkosSparse::Experimental::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

/* MatAssemblyEnd_SeqAIJKokkos() runs once the nonzeros of the matrix are finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
    if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
      PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
      PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
      PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
      PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
      aijseq->free_a  = PETSC_TRUE;
      aijseq->free_ij = PETSC_TRUE;
      /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
         we have the CsrMatrix on device first and then copy to host, followed by
         MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
         One could improve it by not using NULL garray.
      */
    }
    delete aijkok;
    aijkok   = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    const PetscInt *adiag;
    /* The a->diag is created at assembly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
    PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sync CSR data to device if not yet */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out-of-date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto              exec   = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect one needs factors on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
  PetscFunctionReturn(PETSC_SUCCESS);
}
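
/* A minimal sketch (illustrative only) of how the three routines above are meant to be
   paired when one mutates the matrix values on device; "kernels" stands for any device
   code writing to aijkok->a_dual.view_device():

     PetscCall(MatSeqAIJKokkosSyncDevice(A));   // make the device values current
     // ... launch kernels that overwrite the device values ...
     PetscCall(MatSeqAIJKokkosModifyDevice(A)); // mark device modified; invalidates cached (Hermitian) transposes
     PetscCall(MatSeqAIJKokkosSyncHost(A));     // only needed if host access follows
*/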

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}
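
/* Example (hypothetical caller) of consuming the device CSR arrays returned above:

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;

     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     // mtype is PETSC_MEMTYPE_KOKKOS here; i/j/a can be handed to a device library
*/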

static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
{
  PetscFunctionBegin;
  *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

  Input Parameter:
. A - the MATSEQAIJKOKKOS matrix

  Output Parameters:
+ perm_d - the permutation array on device, which satisfies Ta(i) = Aa(perm(i))
- T_d    - the transpose on device, whose value array is allocated but not initialized
*/
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j];                       // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}
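
/* Worked example (illustrative): for the 2x3 matrix
     A = [1 0 2]     with Ai = {0, 2, 3}, Aj = {0, 2, 1}, Aa = {1, 2, 3},
         [0 3 0]
   the routine above produces the 3x2 transpose structure
     Ti = {0, 1, 2, 3}, Tj = {0, 1, 0}, perm = {0, 2, 1},
   so that filling Ta(k) = Aa(perm(k)) gives Ta = {1, 3, 2}, i.e., T = A^T.
*/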

// Generate the transpose on device and cache it internally
// Note: KK transpose_matrix() does not support separate symbolic/numeric phases, so we do it on our own
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose and its value is up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT                = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian and its value is up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH                = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    PetscCall(MatSetType(*newmat, mtype));
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
*/
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
#endif
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
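
   Example Usage (a minimal sketch; the values set and the error checking are omitted):
.vb
   Mat A;
   MatCreate(PETSC_COMM_SELF, &A);
   MatSetSizes(A, n, n, n, n);
   MatSetType(A, MATSEQAIJKOKKOS);
   MatSeqAIJSetPreallocation(A, 3, NULL);
   // ... MatSetValues() on host, then MatAssemblyBegin/End() ...
   MatMult(A, x, y); // the multiply runs on device via KokkosSparse::spmv()
.ve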

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ         *a, *b;
  Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt            m, n, nnz, aN;

  PetscFunctionBegin;

  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a    = static_cast<Mat_SeqAIJ *>(A->data);
  b    = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa   = akok->a_dual.view_device();
  ai   = akok->i_dual.view_device();
  ba   = bkok->a_dual.view_device();
  bi   = bkok->i_dual.view_device();
  m    = A->rmap->n; /* M, N and nnz of C */
  n    = A->cmap->n + B->cmap->n;
  nnz  = a->nz + b->nz;
  aN   = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj      = akok->j_dual.view_device();
    bj      = bkok->j_dual.view_device();
    auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
    auto ci = MatRowMapKokkosView("i", ai.extent(0));
    auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i       = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
          ci(i) = coffset;
          if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {

    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca   = ckok->a_dual.view_device();
    ci   = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i    = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
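
/* Worked example (illustrative): merging the 1x2 matrix A = [a0 a1] with the 1x1 matrix B = [b0]
   above yields the 1x3 matrix C = [a0 a1 b0]; entries of B keep their values but their column
   indices are shifted by aN = A->cmap->n, i.e., Cj = {Aj(0), Aj(1), Bj(0) + aN}. */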

static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata)
{
  PetscFunctionBegin;
  delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  Mat                         A, B;
  bool                        transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  Mat_SeqAIJ                 *c;
  MatProductCtx_SeqAIJKokkos *pdata;
  KokkosCsrMatrix             csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                *product = C->product;
  MatProductType              ptype;
  Mat                         A, B;
  bool                        transA, transB;
  Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
  MatProductCtx_SeqAIJKokkos *pdata;
  MPI_Comm                    comm;
  KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* handles sparse matrix-matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagDense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagDense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt                   n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n);
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
{
  Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
  ConstPetscScalarKokkosView lv, rv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  const auto &Aa     = aijkok->a_dual.view_device();
  const auto &Ai     = aijkok->i_dual.view_device();
  const auto &Aj     = aijkok->j_dual.view_device();
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetKokkosView(ll, &lv));
    PetscCallCXX(Kokkos::parallel_for( // for each row
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i   = t.league_rank(); // row i
        PetscInt len = Ai(i + 1) - Ai(i);
        // scale entries on the row
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
      }));
    PetscCall(VecRestoreKokkosView(ll, &lv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetKokkosView(rr, &rv));
    PetscCallCXX(Kokkos::parallel_for( // for each nonzero
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
    PetscCall(VecRestoreKokkosView(rr, &rv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos     *aijkok;
  PetscInt              n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
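
/* A sketch of the expected get/restore pattern (illustrative): read-modify-write the values on device.

     MatScalarKokkosView a;

     PetscCall(MatSeqAIJGetKokkosView(A, &a));     // syncs device values and returns the device view
     // ... run kernels updating a(...) ...
     PetscCall(MatSeqAIJRestoreKokkosView(A, &a)); // marks the device values as modified
*/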

PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}
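
/* A hedged usage sketch (hypothetical caller): build a matrix directly from device data.

     Kokkos::View<PetscInt *>    i_d("i", m + 1), j_d("j", nnz);
     Kokkos::View<PetscScalar *> a_d("a", nnz);
     // ... fill i_d/j_d/a_d with a valid CSR structure on device ...
     Mat A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, m, n, i_d, j_d, a_d, &A));
*/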

/* Computes Y += alpha X */
static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
{
  Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
  ConstMatScalarKokkosView Xa;
  MatScalarKokkosView      Ya;
  auto                     exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
  PetscCheckTypeName(X, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(Y));
  PetscCall(MatSeqAIJKokkosSyncDevice(X));
  PetscCall(PetscLogGpuTimeBegin());

  if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
    PetscBool e;
    PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
    if (e) {
      PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
      if (e) pattern = SAME_NONZERO_PATTERN;
    }
  }

  /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, while MatAXPY does not.
  */
1288: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290: Xa = xkok->a_dual.view_device();
1291: Ya = ykok->a_dual.view_device();
1293: if (pattern == SAME_NONZERO_PATTERN) {
1294: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1300: Kokkos::parallel_for(
1301: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302: PetscInt i = t.league_rank(); // row i
1303: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304: // Only one thread works in a team
1305: PetscInt p, q = Yi(i);
1306: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1307: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1308: if (Xj(p) == Yj(q)) { // Found it
1309: Ya(q) += alpha * Xa(p);
1310: q++;
1311: } else {
1312: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314: #if PETSC_PKG_KOKKOS_VERSION_GE(5, 0, 0)
1315: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = KokkosKernels::ArithTraits<PetscScalar>::nan();
1316: #elif PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1317: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1318: #else
1319: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1320: #endif
1321: }
1322: }
1323: });
1324: });
1325: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1326: } else { // different nonzero patterns
1327: Mat Z;
1328: KokkosCsrMatrix zcsr;
1329: KernelHandle kh;
1330: kh.create_spadd_handle(true); // X, Y are sorted
1331: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1332: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1333: zkok = new Mat_SeqAIJKokkos(zcsr);
1334: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1335: PetscCall(MatHeaderReplace(Y, &Z));
1336: kh.destroy_spadd_handle();
1337: }
1338: PetscCall(PetscLogGpuTimeEnd());
1339: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1340: PetscFunctionReturn(PETSC_SUCCESS);
1341: }
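// Usage sketch (illustrative, not part of this file): Y += 2*X with both matrices assembled
// as MATSEQAIJKOKKOS. The MatStructure hint picks the cheapest branch above.
//
//   PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));      // KokkosBlas::axpy on the value arrays
//   PetscCall(MatAXPY(Y, 2.0, X, SUBSET_NONZERO_PATTERN));    // per-row merge kernel
//   PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN)); // KokkosSparse::spadd; Y's header is replaced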
1343: struct MatCOOStruct_SeqAIJKokkos {
1344: PetscCount n;
1345: PetscCount Atot;
1346: PetscInt nz;
1347: PetscCountKokkosView jmap;
1348: PetscCountKokkosView perm;
1350: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1351: {
1352: nz = coo_h->nz;
1353: n = coo_h->n;
1354: Atot = coo_h->Atot;
1355: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1356: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1357: }
1358: };
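// Worked example of the jmap/perm encoding (hypothetical data): for a 2x2 matrix, suppose the
// user passed coo_n = 4 entries with (i,j) = (0,0), (1,1), (0,0), (-1,0). The negative-index
// entry is dropped and the two (0,0) entries are combined, giving nz = 2 unique nonzeros and
// Atot = 3 surviving entries, with
//   jmap = {0, 2, 3}   // nonzero k sums the user values indexed by perm(jmap(k))..perm(jmap(k+1)-1)
//   perm = {0, 2, 1}   // nonzero 0 <- v[0] + v[2]; nonzero 1 <- v[1]
// This is exactly how the kernel in MatSetValuesCOO_SeqAIJKokkos() below consumes them.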
1360: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1361: {
1362: PetscFunctionBegin;
1363: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1364: PetscFunctionReturn(PETSC_SUCCESS);
1365: }
1367: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1368: {
1369: Mat_SeqAIJKokkos *akok;
1370: Mat_SeqAIJ *aseq;
1371: PetscContainer container_h;
1372: MatCOOStruct_SeqAIJ *coo_h;
1373: MatCOOStruct_SeqAIJKokkos *coo_d;
1375: PetscFunctionBegin;
1376: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1377: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1378: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1379: delete akok;
1380: mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1381: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1383: // Copy the COO struct to device
1384: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1385: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1386: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1388: // Put the COO struct in a container and then attach that to the matrix
1389: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1390: PetscFunctionReturn(PETSC_SUCCESS);
1391: }
1393: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1394: {
1395: MatScalarKokkosView Aa;
1396: ConstMatScalarKokkosView kv;
1397: PetscMemType memtype;
1398: PetscContainer container;
1399: MatCOOStruct_SeqAIJKokkos *coo;
1401: PetscFunctionBegin;
1402: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1403: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1405: const auto &n = coo->n;
1406: const auto &Annz = coo->nz;
1407: const auto &jmap = coo->jmap;
1408: const auto &perm = coo->perm;
1410: PetscCall(PetscGetMemType(v, &memtype));
1411: if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we need to copy it to device */
1412: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1413: } else {
1414: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1415: }
1417: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1418: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1420: PetscCall(PetscLogGpuTimeBegin());
1421: Kokkos::parallel_for(
1422: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1423: PetscScalar sum = 0.0;
1424: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1425: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1426: });
1427: PetscCall(PetscLogGpuTimeEnd());
1429: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1430: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1431: PetscFunctionReturn(PETSC_SUCCESS);
1432: }
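// Usage sketch (illustrative, not part of this file): COO assembly with repeated value
// updates; v[] may be a host or device pointer, as PetscGetMemType() above dispatches on it.
//
//   PetscInt    coo_i[] = {0, 0, 1}, coo_j[] = {0, 1, 1}; // hypothetical coordinates
//   PetscScalar v[]     = {1.0, 2.0, 3.0};
//   PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // builds the device COO struct once
//   PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES));       // can be called repeatedly with new v[]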
1434: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1435: {
1436: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1438: PetscFunctionBegin;
1439: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1440: A->boundtocpu = PETSC_FALSE;
1442: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1443: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1444: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1445: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1446: A->ops->scale = MatScale_SeqAIJKokkos;
1447: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1448: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1449: A->ops->mult = MatMult_SeqAIJKokkos;
1450: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1451: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1452: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1453: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1454: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1455: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1456: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1457: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1458: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1459: A->ops->shift = MatShift_SeqAIJKokkos;
1460: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1461: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1462: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos;
1463: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1464: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1465: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1466: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1467: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1468: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1469: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1471: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1472: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1473: #if defined(PETSC_HAVE_HYPRE)
1474: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1475: #endif
1476: PetscFunctionReturn(PETSC_SUCCESS);
1477: }
1479: /*
1480: Extract the (prescribed) diagonal blocks of the matrix and then invert them
1482: Input Parameters:
1483: + A - the MATSEQAIJKOKKOS matrix
1484: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1485: . bs2 - squares of the block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1486: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1487: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1489: Output Parameter:
1490: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1491: */
1492: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1493: {
1494: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1495: PetscInt nblocks = bs.extent(0) - 1;
1497: PetscFunctionBegin;
1498: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1500: // Pull out the diagonal blocks of the matrix and then invert the blocks
1501: auto Aa = akok->a_dual.view_device();
1502: auto Ai = akok->i_dual.view_device();
1503: auto Aj = akok->j_dual.view_device();
1504: auto Adiag = akok->diag_dual.view_device();
1505: // TODO: how to tune the team size?
1506: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1507: auto ts = Kokkos::AUTO();
1508: #else
1509: auto ts = 16; // improved performance ~30% over Kokkos::AUTO() with CUDA, but fails with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1510: #endif
1511: PetscCallCXX(Kokkos::parallel_for(
1512: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1513: const PetscInt bid = teamMember.league_rank(); // block id
1514: const PetscInt rstart = bs(bid); // this block starts from this row
1515: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1516: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1517: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1519: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1520: PetscInt i = rstart + r; // i-th row in A
1522: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1523: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1525: for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1526: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1527: B(r, c) = 0.0;
1528: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1529: B(r, c) = Aa(first + c);
1530: } else { // this entry does not show up in the CSR
1531: B(r, c) = 0.0;
1532: }
1533: }
1534: } else { // rare case that the diagonal does not exist
1535: const PetscInt begin = Ai(i);
1536: const PetscInt end = Ai(i + 1);
1537: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1538: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1539: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1540: else if (Aj(j) >= rstart + m) break;
1541: }
1542: }
1543: });
1545: // LU-decompose B (w/o pivoting) and then invert B
1546: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1547: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1548: }));
1549: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1550: PetscFunctionReturn(PETSC_SUCCESS);
1551: }
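// Worked example of the block descriptors (hypothetical): two diagonal blocks of sizes 2 and 3
// on a 5-row matrix give
//   bs     = {0, 2, 5}       // block b spans rows bs(b)..bs(b+1)-1
//   bs2    = {0, 4, 13}      // prefix sums of squared block sizes: offsets into diagVal[]/work[]
//   blkMap = {0, 0, 1, 1, 1} // row -> block id
// so diagVal[] and work[] must each hold bs2(nblocks) = 13 scalars.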
1553: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1554: {
1555: Mat_SeqAIJ *aseq;
1556: PetscInt i, m, n;
1557: auto exec = PetscGetKokkosExecutionSpace();
1559: PetscFunctionBegin;
1560: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1562: m = akok->nrows();
1563: n = akok->ncols();
1564: PetscCall(MatSetSizes(A, m, n, m, n));
1565: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1567: /* Set up data structures of A as a MATSEQAIJ */
1568: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1569: aseq = (Mat_SeqAIJ *)A->data;
1571: PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1572: PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1574: aseq->i = akok->i_host_data();
1575: aseq->j = akok->j_host_data();
1576: aseq->a = akok->a_host_data();
1577: aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1578: aseq->free_a = PETSC_FALSE;
1579: aseq->free_ij = PETSC_FALSE;
1580: aseq->nz = akok->nnz();
1581: aseq->maxnz = aseq->nz;
1583: PetscCall(PetscMalloc1(m, &aseq->imax));
1584: PetscCall(PetscMalloc1(m, &aseq->ilen));
1585: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1587: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1588: akok->nonzerostate = A->nonzerostate;
1589: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1590: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1591: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1592: PetscFunctionReturn(PETSC_SUCCESS);
1593: }
1595: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1596: {
1597: PetscFunctionBegin;
1598: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1599: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1600: PetscFunctionReturn(PETSC_SUCCESS);
1601: }
1603: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1604: {
1605: Mat_SeqAIJKokkos *akok;
1607: PetscFunctionBegin;
1608: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1609: PetscCall(MatCreate(comm, A));
1610: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1611: PetscFunctionReturn(PETSC_SUCCESS);
1612: }
1614: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1616: Note we have names like MatSeqAIJSetPreallocationCSR, so we use capitalized CSR
1617: */
1618: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1619: {
1620: PetscFunctionBegin;
1621: PetscCall(MatCreate(comm, A));
1622: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1623: PetscFunctionReturn(PETSC_SUCCESS);
1624: }
1626: /*@C
1627: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format,
1628: the Kokkos-enabled sequential version of the AIJ format. This matrix will ultimately be handled by
1629: Kokkos for calculations.
1631: Collective
1633: Input Parameters:
1634: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1635: . m - number of rows
1636: . n - number of columns
1637: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1638: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1640: Output Parameter:
1641: . A - the matrix
1643: Level: intermediate
1645: Notes:
1646: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1647: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1648: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1650: The AIJ format, also called
1651: compressed row storage, is fully compatible with standard Fortran
1652: storage. That is, the stored row and column indices can begin at
1653: either one (as in Fortran) or zero.
1655: Specify the preallocated storage with either `nz` or `nnz` (not both).
1656: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1657: allocation.
1659: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1660: @*/
1661: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1662: {
1663: PetscFunctionBegin;
1664: PetscCall(PetscKokkosInitializeCheck());
1665: PetscCall(MatCreate(comm, A));
1666: PetscCall(MatSetSizes(*A, m, n, m, n));
1667: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1668: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1669: PetscFunctionReturn(PETSC_SUCCESS);
1670: }
1672: // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1673: // For example, for transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solve natively (KK does not, while cusparse does).
1674: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1675: // Similarly, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1676: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1677: {
1678: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1679: const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1680: const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky
1682: PetscFunctionBegin;
1683: if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1684: if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1685: if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1686: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1687: }
1688: PetscFunctionReturn(PETSC_SUCCESS);
1689: }
1691: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1692: {
1693: const PetscInt n = A->rmap->n;
1694: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1695: const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1696: const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky
1698: PetscFunctionBegin;
1699: if (!factors->transpose_updated) {
1700: if (has_upper) {
1701: if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
1702: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1703: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1704: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1705: }
1707: if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1708: if (!factors->U) {
1709: Mat_SeqAIJ *seq;
1711: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1712: PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1714: seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1715: factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1716: factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1717: factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1718: } else {
1719: PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1720: }
1721: // Copy Ut from host to device
1722: PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1723: PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1724: PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1725: } else { // If U was computed on device, we also compute the transpose there
1726: // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices, so we sort them ourselves until KK provides finer control options.
1727: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1728: factors->jU_d, factors->aU_d,
1729: factors->iUt_d, factors->jUt_d,
1730: factors->aUt_d));
1731: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732: }
1733: PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1734: }
1736: // do the same for L with LU
1737: if (has_lower) {
1738: if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
1739: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1740: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1741: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1742: }
1744: if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1745: if (!factors->L) {
1746: Mat_SeqAIJ *seq;
1748: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1749: PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1751: seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1752: factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1753: factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1754: factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1755: } else {
1756: PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1757: }
1758: // Copy Lt from host to device
1759: PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1760: PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1761: PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1762: } else { // If L was computed on device, we also compute the transpose there
1763: // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices, so we sort them ourselves until KK provides finer control options.
1764: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1765: factors->jL_d, factors->aL_d,
1766: factors->iLt_d, factors->jLt_d,
1767: factors->aLt_d));
1768: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769: }
1770: PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1771: }
1773: factors->transpose_updated = PETSC_TRUE;
1774: }
1775: PetscFunctionReturn(PETSC_SUCCESS);
1776: }
1778: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and, by symmetry, column) permutation matrix on A.
1779: // R is represented by rowperm in factors. If R is the identity (i.e., no reordering), then rowperm is empty.
1780: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1781: {
1782: auto exec = PetscGetKokkosExecutionSpace();
1783: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1784: PetscInt m = A->rmap->n;
1785: PetscScalarKokkosView D = factors->D_d;
1786: PetscScalarKokkosView X, Y, B; // alias
1787: ConstPetscScalarKokkosView b;
1788: PetscScalarKokkosView x;
1789: PetscIntKokkosView &rowperm = factors->rowperm;
1790: PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1792: PetscFunctionBegin;
1793: PetscCall(PetscLogGpuTimeBegin());
1794: PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T
1795: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1796: PetscCall(VecGetKokkosView(bb, &b));
1797: PetscCall(VecGetKokkosViewWrite(xx, &x));
1799: // Solve U^T Y = B
1800: if (identity) { // No row permutation is needed on b
1801: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1802: Y = factors->workVector;
1803: } else {
1804: B = factors->workVector;
1805: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1806: Y = x;
1807: }
1808: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1810: // Solve diag(D) Y' = Y.
1811: // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1812: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1814: // Solve UX = Y
1815: if (identity) {
1816: X = x;
1817: } else {
1818: X = factors->workVector; // B is not needed anymore
1819: }
1820: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1822: // Reorder X with the inverse column (row) permutation
1823: if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1825: PetscCall(VecRestoreKokkosView(bb, &b));
1826: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1827: PetscCall(PetscLogGpuTimeEnd());
1828: PetscFunctionReturn(PETSC_SUCCESS);
1829: }
1831: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1832: // R and C are represented by rowperm and colperm in factors.
1833: // If R or C is the identity (i.e., no reordering), then rowperm or colperm is empty.
1834: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1835: {
1836: auto exec = PetscGetKokkosExecutionSpace();
1837: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1838: PetscInt m = A->rmap->n;
1839: PetscScalarKokkosView X, Y, B; // alias
1840: ConstPetscScalarKokkosView b;
1841: PetscScalarKokkosView x;
1842: PetscIntKokkosView &rowperm = factors->rowperm;
1843: PetscIntKokkosView &colperm = factors->colperm;
1844: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1847: PetscFunctionBegin;
1848: PetscCall(PetscLogGpuTimeBegin());
1849: PetscCall(MatSeqAIJKokkosSolveCheck(A));
1850: PetscCall(VecGetKokkosView(bb, &b));
1851: PetscCall(VecGetKokkosViewWrite(xx, &x));
1853: // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
1854: if (row_identity) {
1855: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1856: Y = factors->workVector;
1857: } else {
1858: B = factors->workVector;
1859: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1860: Y = x;
1861: }
1862: PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1864: // Solve U C^- x = Y
1865: if (col_identity) {
1866: X = x;
1867: } else {
1868: X = factors->workVector;
1869: }
1870: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1872: // x = C X; Reorder X with the inverse col permutation
1873: if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1875: PetscCall(VecRestoreKokkosView(bb, &b));
1876: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1877: PetscCall(PetscLogGpuTimeEnd());
1878: PetscFunctionReturn(PETSC_SUCCESS);
1879: }
1881: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1882: // R and C are represented by rowperm and colperm in factors.
1883: // If R or C is the identity (i.e., no reordering), then rowperm or colperm is empty.
1884: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1885: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1886: {
1887: auto exec = PetscGetKokkosExecutionSpace();
1888: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1889: PetscInt m = A->rmap->n;
1890: PetscScalarKokkosView X, Y, B; // alias
1891: ConstPetscScalarKokkosView b;
1892: PetscScalarKokkosView x;
1893: PetscIntKokkosView &rowperm = factors->rowperm;
1894: PetscIntKokkosView &colperm = factors->colperm;
1895: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1896: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1898: PetscFunctionBegin;
1899: PetscCall(PetscLogGpuTimeBegin());
1900: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1901: PetscCall(VecGetKokkosView(bb, &b));
1902: PetscCall(VecGetKokkosViewWrite(xx, &x));
1904: // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
1905: if (col_identity) { // No column permutation is needed on b
1906: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1907: Y = factors->workVector;
1908: } else {
1909: B = factors->workVector;
1910: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1911: Y = x;
1912: }
1913: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1915: // Solve L^T X = Y
1916: if (row_identity) {
1917: X = x;
1918: } else {
1919: X = factors->workVector;
1920: }
1921: PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1923: // x = R^- X = R^T X; Reorder X with the inverse row permutation
1924: if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1926: PetscCall(VecRestoreKokkosView(bb, &b));
1927: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1928: PetscCall(PetscLogGpuTimeEnd());
1929: PetscFunctionReturn(PETSC_SUCCESS);
1930: }
1932: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1933: {
1934: PetscFunctionBegin;
1935: PetscCall(MatSeqAIJKokkosSyncHost(A));
1936: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1938: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1939: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1940: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1941: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1942: const MatScalar *Ba = b->a;
1943: PetscInt m = B->rmap->n, n = B->cmap->n;
1945: if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1946: // Allocate memory and copy the structure
1947: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1948: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1949: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1950: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1951: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1952: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1954: PetscInt *Li = factors->iL_h.data();
1955: PetscInt *Lj = factors->jL_h.data();
1956: PetscInt *Ui = factors->iU_h.data();
1957: PetscInt *Uj = factors->jU_h.data();
1959: Li[0] = Ui[0] = 0;
1960: for (PetscInt i = 0; i < m; i++) {
1961: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1962: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1964: PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1965: Lj[Li[i] + llen] = i; // diagonal entry of L
1967: Uj[Ui[i]] = i; // diagonal entry of U
1968: PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
1970: Li[i + 1] = Li[i] + llen + 1;
1971: Ui[i + 1] = Ui[i] + ulen;
1972: }
1974: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1975: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1976: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1977: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1978: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1979: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1981: // Copy row/col permutation to device
1982: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1983: PetscBool row_identity;
1984: PetscCall(ISIdentity(rowperm, &row_identity));
1985: if (!row_identity) {
1986: const PetscInt *ip;
1988: PetscCall(ISGetIndices(rowperm, &ip));
1989: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1990: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1991: PetscCall(ISRestoreIndices(rowperm, &ip));
1992: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1993: }
1995: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
1996: PetscBool col_identity;
1997: PetscCall(ISIdentity(colperm, &col_identity));
1998: if (!col_identity) {
1999: const PetscInt *ip;
2001: PetscCall(ISGetIndices(colperm, &ip));
2002: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2003: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2004: PetscCall(ISRestoreIndices(colperm, &ip));
2005: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2006: }
2008: /* Create sptrsv handles for L, U and their transpose */
2009: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2010: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2011: #else
2012: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2013: #endif
2014: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2015: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2016: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2017: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2018: }
2020: // Copy the values
2021: for (PetscInt i = 0; i < m; i++) {
2022: PetscInt llen = Bi[i + 1] - Bi[i];
2023: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2024: const PetscInt *Li = factors->iL_h.data();
2025: const PetscInt *Ui = factors->iU_h.data();
2027: PetscScalar *La = factors->aL_h.data();
2028: PetscScalar *Ua = factors->aU_h.data();
2030: PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2031: La[Li[i] + llen] = 1.0; // diagonal entry
2033: Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry
2034: PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2035: }
2037: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2038: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2039: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2040: factors->transpose_updated = PETSC_FALSE;
2041: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2043: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2044: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2045: }
2047: B->ops->matsolve = NULL;
2048: B->ops->matsolvetranspose = NULL;
2049: PetscFunctionReturn(PETSC_SUCCESS);
2050: }
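// Storage note (a sketch of PETSc's factored Mat_SeqAIJ layout, as consumed above): for row i
// of the factor B, Bi[i]..Bi[i+1]-1 holds the strictly lower part of L (its unit diagonal is
// implicit and re-added as La[Li[i] + llen] = 1.0), Bdiag[i] holds the inverse of U(i,i)
// (undone by Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]), and Bdiag[i+1]+1..Bdiag[i]-1 holds the strictly
// upper part of U; Bdiag[] decreases with i, hence ulen = Bdiag[i] - Bdiag[i+1].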
2052: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2053: {
2054: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2055: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2056: PetscInt fill_lev = info->levels;
2058: PetscFunctionBegin;
2059: PetscCall(PetscLogGpuTimeBegin());
2060: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2061: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2063: auto a_d = aijkok->a_dual.view_device();
2064: auto i_d = aijkok->i_dual.view_device();
2065: auto j_d = aijkok->j_dual.view_device();
2067: PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2069: B->assembled = PETSC_TRUE;
2070: B->preallocated = PETSC_TRUE;
2071: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2072: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2073: B->ops->matsolve = NULL;
2074: B->ops->matsolvetranspose = NULL;
2076: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2077: factors->transpose_updated = PETSC_FALSE;
2078: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2079: /* TODO: log flops, but how to know that? */
2080: PetscCall(PetscLogGpuTimeEnd());
2081: PetscFunctionReturn(PETSC_SUCCESS);
2082: }
2084: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2085: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2086: {
2087: Mat_SeqAIJKokkos *aijkok;
2088: Mat_SeqAIJ *b;
2089: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2090: PetscInt fill_lev = info->levels;
2091: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2092: PetscInt n = A->rmap->n;
2094: PetscFunctionBegin;
2095: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2096: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2098: /* Create a spiluk handle and then do symbolic factorization */
2099: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2100: factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2102: auto spiluk_handle = factors->kh.get_spiluk_handle();
2104: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2105: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2106: Kokkos::realloc(factors->iU_d, n + 1);
2107: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2109: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2110: auto i_d = aijkok->i_dual.view_device();
2111: auto j_d = aijkok->j_dual.view_device();
2112: PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2113: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2115: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2116: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2117: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2118: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2120: /* TODO: add options to select sptrsv algorithms */
2121: /* Create sptrsv handles for L, U and their transpose */
2122: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2123: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2124: #else
2125: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2126: #endif
2128: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2129: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2130: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2131: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2133: /* Fill fields of the factor matrix B */
2134: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2135: b = (Mat_SeqAIJ *)B->data;
2136: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2137: B->info.fill_ratio_given = info->fill;
2138: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2140: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2141: PetscFunctionReturn(PETSC_SUCCESS);
2142: }
2144: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2145: {
2146: PetscFunctionBegin;
2147: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2148: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2149: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2150: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2151: PetscFunctionReturn(PETSC_SUCCESS);
2152: }
2154: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2155: {
2156: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2158: PetscFunctionBegin;
2159: if (!info->factoronhost) {
2160: PetscCall(ISIdentity(isrow, &row_identity));
2161: PetscCall(ISIdentity(iscol, &col_identity));
2162: }
2164: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2165: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2167: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2168: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2169: } else {
2170: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2171: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2172: }
2173: PetscFunctionReturn(PETSC_SUCCESS);
2174: }
2176: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2177: {
2178: PetscFunctionBegin;
2179: PetscCall(MatSeqAIJKokkosSyncHost(A));
2180: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2182: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2183: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2184: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
2185: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2186: const MatScalar *Ba = b->a;
2187: PetscInt m = B->rmap->n;
2189: if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2190: // Allocate memory and copy the structure
2191: factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2192: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2193: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2194: factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
2195: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2196: factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2198: // Build jU_h from Bj, whose rows are 'skewed' by one entry since they lack the diagonal
2199: PetscInt *Uj = factors->jU_h.data();
2200: for (PetscInt i = 0; i < m; i++) {
2201: PetscInt ulen = Bi[i + 1] - Bi[i];
2202: Uj[Bi[i]] = i; // diagonal entry
2203: PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2204: }
2206: // Copy iU, jU to device
2207: PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2208: PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2210: // Copy row/col permutation to device
2211: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
2212: PetscBool row_identity;
2213: PetscCall(ISIdentity(rowperm, &row_identity));
2214: if (!row_identity) {
2215: const PetscInt *ip;
2217: PetscCall(ISGetIndices(rowperm, &ip));
2218: PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2219: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2220: PetscCall(ISRestoreIndices(rowperm, &ip));
2221: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2222: }
2224: // Create sptrsv handles for U and U^T
2225: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2226: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2227: #else
2228: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2229: #endif
2230: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2231: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2232: }
2233: // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2234: B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
2235: B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2237: // Copy the values
2238: PetscScalar *Ua = factors->aU_h.data();
2239: PetscScalar *D = factors->D_h.data();
2240: for (PetscInt i = 0; i < m; i++) {
2241: D[i] = Ba[Bdiag[i]]; // Ba[Bdiag[i]] is actually the inverse of the diagonal
2242: Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2243: for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2244: }
2245: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2246: PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2248: factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2249: factors->transpose_updated = PETSC_FALSE;
2250: }
2252: B->ops->matsolve = NULL;
2253: B->ops->matsolvetranspose = NULL;
2254: PetscFunctionReturn(PETSC_SUCCESS);
2255: }
2257: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2258: {
2259: PetscFunctionBegin;
2260: if (info->solveonhost) {
2261: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2262: PetscCall(MatSetType(B, MATSEQSBAIJ));
2263: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2264: }
2266: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2268: if (!info->solveonhost) {
2269: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2270: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2271: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2272: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2273: }
2274: PetscFunctionReturn(PETSC_SUCCESS);
2275: }
2277: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2278: {
2279: PetscFunctionBegin;
2280: if (info->solveonhost) {
2281: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2282: PetscCall(MatSetType(B, MATSEQSBAIJ));
2283: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2284: }
2286: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2288: if (!info->solveonhost) {
2289: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2290: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2291: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2292: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2293: }
2294: PetscFunctionReturn(PETSC_SUCCESS);
2295: }
2297: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2298: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2299: {
2300: PetscFunctionBegin;
2301: *type = MATSOLVERKOKKOS;
2302: PetscFunctionReturn(PETSC_SUCCESS);
2303: }
2305: /*MC
2306: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2307: of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.
2309: Level: beginner
2311: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2312: M*/
2313: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2314: {
2315: PetscInt n = A->rmap->n;
2316: MPI_Comm comm;
2318: PetscFunctionBegin;
2319: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2320: PetscCall(MatCreate(comm, B));
2321: PetscCall(MatSetSizes(*B, n, n, n, n));
2322: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2323: (*B)->factortype = ftype;
2324: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2325: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2326: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2328: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2329: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2330: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2331: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2332: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2333: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2334: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2335: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2336: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2337: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2338: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2339: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2341: // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though we do it on host
2342: (*B)->canuseordering = PETSC_TRUE;
2343: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2344: PetscFunctionReturn(PETSC_SUCCESS);
2345: }
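// Usage sketch (illustrative, not part of this file): select this solver for a factorization
// preconditioner, either in code or via -pc_factor_mat_solver_type kokkos on the command line.
//
//   PC pc;
//   PetscCall(KSPGetPC(ksp, &pc));  // `ksp` is a hypothetical, already-created KSP
//   PetscCall(PCSetType(pc, PCLU)); // or PCILU, PCCHOLESKY, PCICC
//   PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));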
2347: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2348: {
2349: PetscFunctionBegin;
2350: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2352: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2353: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2354: PetscFunctionReturn(PETSC_SUCCESS);
2355: }
2357: /* Utility to print out a KokkosCsrMatrix for debugging */
2358: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2359: {
2360: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2361: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2362: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2363: const PetscInt *i = iv.data();
2364: const PetscInt *j = jv.data();
2365: const PetscScalar *a = av.data();
2366: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2368: PetscFunctionBegin;
2369: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2370: for (PetscInt k = 0; k < m; k++) {
2371: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2372: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2373: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2374: }
2375: PetscFunctionReturn(PETSC_SUCCESS);
2376: }
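// Debugging sketch (illustrative): dump the device CSR of a MATSEQAIJKOKKOS matrix.
//
//   KokkosCsrMatrix csr;
//   PetscCall(MatSeqAIJKokkosGetKokkosCsrMatrix(A, &csr));
//   PetscCall(PrintCsrMatrix(csr)); // prints one "row: col(value), ..." line per row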