Actual source code: aijkok.kokkos.cxx
1: #include <petsc_kokkos.hpp>
2: #include <petscvec_kokkos.hpp>
3: #include <petscmat_kokkos.hpp>
4: #include <petscpkg_version.h>
5: #include <petsc/private/petscimpl.h>
6: #include <petsc/private/sfimpl.h>
7: #include <petsc/private/kokkosimpl.hpp>
8: #include <petscsys.h>
10: #include <Kokkos_Core.hpp>
11: #include <KokkosBlas.hpp>
12: #include <KokkosSparse_CrsMatrix.hpp>
14: // To suppress compiler warnings:
15: // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16: // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17: // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18: // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19: #define DISABLE_CUSPARSE_DEPRECATED
20: #include <KokkosSparse_spmv.hpp>
22: #include <KokkosSparse_spiluk.hpp>
23: #include <KokkosSparse_sptrsv.hpp>
24: #include <KokkosSparse_spgemm.hpp>
25: #include <KokkosSparse_spadd.hpp>
26: #include <KokkosBatched_LU_Decl.hpp>
27: #include <KokkosBatched_InverseLU_Decl.hpp>
29: #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
31: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32: #include <KokkosSparse_Utils.hpp>
33: using KokkosSparse::sort_crs_matrix;
34: using KokkosSparse::Impl::transpose_matrix;
35: #else
36: #include <KokkosKernels_Sorting.hpp>
37: using KokkosKernels::sort_crs_matrix;
38: using KokkosKernels::Impl::transpose_matrix;
39: #endif
41: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42: using KokkosSparse::spiluk_symbolic;
43: using KokkosSparse::spiluk_numeric;
44: using KokkosSparse::sptrsv_symbolic;
45: using KokkosSparse::sptrsv_solve;
46: using KokkosSparse::Experimental::SPTRSVAlgorithm;
47: using KokkosSparse::Experimental::SPILUKAlgorithm;
48: #else
49: using KokkosSparse::Experimental::spiluk_symbolic;
50: using KokkosSparse::Experimental::spiluk_numeric;
51: using KokkosSparse::Experimental::sptrsv_symbolic;
52: using KokkosSparse::Experimental::sptrsv_solve;
53: using KokkosSparse::Experimental::SPTRSVAlgorithm;
54: using KokkosSparse::Experimental::SPILUKAlgorithm;
55: #endif
57: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
59: /* MatAssemblyEnd_SeqAIJKokkos() is called when the nonzeros of the matrix have been finalized, either after
60:    we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
61: In the latter case, it is important to set a_dual's sync state correctly.
62: */
63: static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64: {
65: Mat_SeqAIJ *aijseq;
66: Mat_SeqAIJKokkos *aijkok;
68: PetscFunctionBegin;
69: if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70: PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
72: aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
75: /* If aijkok does not exist, we just copy i, j to device.
76:      If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
77: In both cases, we build a new aijkok structure.
78: */
79: if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80: if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81: PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82: PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83:       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a)); /* a[] holds PetscScalar, not PetscInt */
84: PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85: PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86: PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87: aijseq->free_a = PETSC_TRUE;
88: aijseq->free_ij = PETSC_TRUE;
89: /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90: we have the CsrMatrix on device first and then copy to host, followed by
91: MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92: One could improve it by not using NULL garray.
93: */
94: }
95: delete aijkok;
96: aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
97: A->spptr = aijkok;
98: } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99: MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
100: auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
101: aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
102: }
103: PetscFunctionReturn(PETSC_SUCCESS);
104: }
106: /* Sync CSR data to device if not yet */
107: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
108: {
109: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
111: PetscFunctionBegin;
112: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
113: PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
114: if (aijkok->a_dual.need_sync_device()) {
115: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
116:     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
117: aijkok->hermitian_updated = PETSC_FALSE;
118: }
119: PetscFunctionReturn(PETSC_SUCCESS);
120: }
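// ---[ Annotation: a minimal sketch, not part of aijkok.kokkos.cxx ]---------
// The Sync/Modify helpers here wrap Kokkos::DualView's modify/sync protocol.
// In plain Kokkos the idiom looks like:
//
//   Kokkos::DualView<PetscScalar *> v("v", n);
//   v.modify_host();               // mark the host copy as most recent
//   v.sync_device();               // deep-copies host -> device only if stale
//   assert(!v.need_sync_device()); // both sides are now coherent
//
// MatSeqAIJKokkosSyncDevice() applies this idiom to the value array a_dual and
// additionally invalidates the cached (Hermitian) transpose values.
// ---------------------------------------------------------------------------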
122: /* Mark the CSR data on device as modified */
123: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
124: {
125: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
127: PetscFunctionBegin;
128:   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
129: aijkok->a_dual.clear_sync_state();
130: aijkok->a_dual.modify_device();
131: aijkok->transpose_updated = PETSC_FALSE;
132: aijkok->hermitian_updated = PETSC_FALSE;
133: PetscCall(MatSeqAIJInvalidateDiagonal(A));
134: PetscCall(PetscObjectStateIncrease((PetscObject)A));
135: PetscFunctionReturn(PETSC_SUCCESS);
136: }
138: static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
139: {
140: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
141: auto exec = PetscGetKokkosExecutionSpace();
143: PetscFunctionBegin;
144: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
145:   /* We do not expect factors to be needed on host */
146: PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
147: PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
148: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
149: PetscFunctionReturn(PETSC_SUCCESS);
150: }
152: static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
153: {
154: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
156: PetscFunctionBegin;
157: /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
158: Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
159:      reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
160: must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
161: */
162: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
163: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
164: *array = aijkok->a_dual.view_host().data();
165: } else { /* Happens when calling MatSetValues on a newly created matrix */
166: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
167: }
168: PetscFunctionReturn(PETSC_SUCCESS);
169: }
171: static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
172: {
173: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
175: PetscFunctionBegin;
176: if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
177: PetscFunctionReturn(PETSC_SUCCESS);
178: }
180: static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
181: {
182: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
184: PetscFunctionBegin;
185: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
186: PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
187: *array = aijkok->a_dual.view_host().data();
188: } else {
189: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
190: }
191: PetscFunctionReturn(PETSC_SUCCESS);
192: }
194: static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
195: {
196: PetscFunctionBegin;
197: *array = NULL;
198: PetscFunctionReturn(PETSC_SUCCESS);
199: }
201: static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
202: {
203: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
205: PetscFunctionBegin;
206: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
207: *array = aijkok->a_dual.view_host().data();
208:   } else { /* E.g., happens with MatZeroEntries on a preallocated but not assembled matrix */
209: *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
210: }
211: PetscFunctionReturn(PETSC_SUCCESS);
212: }
214: static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
215: {
216: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
218: PetscFunctionBegin;
219: if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
220: aijkok->a_dual.clear_sync_state();
221: aijkok->a_dual.modify_host();
222: }
223: PetscFunctionReturn(PETSC_SUCCESS);
224: }
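// ---[ Annotation: hedged usage sketch of the accessors above ]--------------
// Host-side access syncs device -> host only when the device holds the newest
// values; the matching Restore call records the host-side write.
//
//   PetscScalar *a;
//   PetscCall(MatSeqAIJGetArray(A, &a));     // dispatches to the version above
//   a[0] *= 2.0;                             // modify values on host
//   PetscCall(MatSeqAIJRestoreArray(A, &a)); // marks the host copy as modified
// ---------------------------------------------------------------------------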
226: static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
227: {
228: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
230: PetscFunctionBegin;
231: PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
233: if (i) *i = aijkok->i_device_data();
234: if (j) *j = aijkok->j_device_data();
235: if (a) {
236: PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
237: *a = aijkok->a_device_data();
238: }
239: if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
240: PetscFunctionReturn(PETSC_SUCCESS);
241: }
243: static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
244: {
245: PetscFunctionBegin;
246: *mtype = PETSC_MEMTYPE_KOKKOS;
247: PetscFunctionReturn(PETSC_SUCCESS);
248: }
250: /*
251: Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
253: Input Parameter:
254: . A - the MATSEQAIJKOKKOS matrix
256: Output Parameters:
257: + perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
258: - T_d - the transpose on device, whose value array is allocated but not initialized
259: */
260: static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
261: {
262: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
263: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
264: const PetscInt *Ai = aseq->i, *Aj = aseq->j;
265: MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
266: MatRowMapType *Ti = Ti_h.data();
267: MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
268: MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
269: PetscInt *Tj = Tj_h.data();
270: PetscInt *perm = perm_h.data();
271: PetscInt *offset;
273: PetscFunctionBegin;
274: // Populate Ti
275: PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
276:   Ti++; // shift by one so the loop below accumulates the count for column c into Ti[c+1]
277:   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
278:   Ti--; // shift back; the running sum below turns per-column counts into T's row offsets
279: for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
281: // Populate Tj and the permutation array
282: PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
283: for (PetscInt i = 0; i < m; i++) {
284: for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
285: PetscInt r = Aj[j]; // row r of T
286: PetscInt disp = Ti[r] + offset[r];
288: Tj[disp] = i; // col i of T
289: perm[disp] = j;
290: offset[r]++;
291: }
292: }
293: PetscCall(PetscFree(offset));
295: // Sort each row of T, along with the permutation array
296: for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
298: // Output perm and T on device
299: auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
300: auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
301: PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
302: PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
303: PetscFunctionReturn(PETSC_SUCCESS);
304: }
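// ---[ Annotation: worked example of the structure built above ]-------------
// For the 2x3 matrix with rows {a0, _, a1} and {_, a2, a3}, i.e.
//   Ai = {0,2,4}, Aj = {0,2,1,2}, Aa = {a0,a1,a2,a3},
// the 3x2 transpose T gets
//   Ti = {0,1,2,4}, Tj = {0,1,0,1}, perm = {0,2,1,3},
// so Ta(i) = Aa(perm(i)) fills T's values in CSR order.
// ---------------------------------------------------------------------------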
306: // Generate the transpose on device and cache it internally
307: // Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
308: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
309: {
310: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
311: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
312: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
313: KokkosCsrMatrix &T = akok->csrmatT;
315: PetscFunctionBegin;
316: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
317: PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
319: const auto &Aa = akok->a_dual.view_device();
321: if (A->symmetric == PETSC_BOOL3_TRUE) {
322: *csrmatT = akok->csrmat;
323: } else {
324: // See if we already have a cached transpose and its value is up to date
325: if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
326: if (!akok->transpose_updated) { // if the value is out of date, update the cached version
327: const auto &perm = akok->transpose_perm; // get the permutation array
328: auto &Ta = T.values;
330: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
331: }
332: } else { // Generate T of size n x m for the first time
333: MatRowMapKokkosView perm;
335: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
336: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
337: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
338: }
339: akok->transpose_updated = PETSC_TRUE;
340: *csrmatT = akok->csrmatT;
341: }
342: PetscFunctionReturn(PETSC_SUCCESS);
343: }
345: // Generate the Hermitian on device and cache it internally
346: static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
347: {
348: Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
349: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
350: PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
351: KokkosCsrMatrix &T = akok->csrmatH;
353: PetscFunctionBegin;
354: PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
355: PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
357: const auto &Aa = akok->a_dual.view_device();
359: if (A->hermitian == PETSC_BOOL3_TRUE) {
360: *csrmatH = akok->csrmat;
361: } else {
362: // See if we already have a cached hermitian and its value is up to date
363:     if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
364: if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
365: const auto &perm = akok->transpose_perm; // get the permutation array
366: auto &Ta = T.values;
368: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
369: }
370: } else { // Generate T of size n x m for the first time
371: MatRowMapKokkosView perm;
373: PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
374: akok->transpose_perm = perm; // cache the perm in this matrix for reuse
375: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
376: }
377: akok->hermitian_updated = PETSC_TRUE;
378: *csrmatH = akok->csrmatH;
379: }
380: PetscFunctionReturn(PETSC_SUCCESS);
381: }
383: /* y = A x */
384: static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
385: {
386: Mat_SeqAIJKokkos *aijkok;
387: ConstPetscScalarKokkosView xv;
388: PetscScalarKokkosView yv;
390: PetscFunctionBegin;
391: PetscCall(PetscLogGpuTimeBegin());
392: PetscCall(MatSeqAIJKokkosSyncDevice(A));
393: PetscCall(VecGetKokkosView(xx, &xv));
394: PetscCall(VecGetKokkosViewWrite(yy, &yv));
395: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
396: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
397: PetscCall(VecRestoreKokkosView(xx, &xv));
398: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
399:   /* 2.0*nnz - numRows would be more accurate, but that assumes there are no empty rows. So we are a little sloppy here. */
400: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
401: PetscCall(PetscLogGpuTimeEnd());
402: PetscFunctionReturn(PETSC_SUCCESS);
403: }
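// ---[ Annotation: sketch of the KokkosKernels call used in MatMult ]--------
// MatMult and the transpose variants below all funnel into one interface:
//
//   // y = alpha * op(A) * x + beta * y, with op selected by the mode string:
//   //   "N" -> A,  "T" -> A^T,  "C" -> conj(A^T)
//   KokkosSparse::spmv(exec, mode, alpha, csrmat, xv, beta, yv);
//
// When form_explicit_transpose is set, PETSc trades memory for speed: it
// caches an explicit transpose and applies it with mode "N".
// ---------------------------------------------------------------------------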
405: /* y = A^T x */
406: static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
407: {
408: Mat_SeqAIJKokkos *aijkok;
409: const char *mode;
410: ConstPetscScalarKokkosView xv;
411: PetscScalarKokkosView yv;
412: KokkosCsrMatrix csrmat;
414: PetscFunctionBegin;
415: PetscCall(PetscLogGpuTimeBegin());
416: PetscCall(MatSeqAIJKokkosSyncDevice(A));
417: PetscCall(VecGetKokkosView(xx, &xv));
418: PetscCall(VecGetKokkosViewWrite(yy, &yv));
419: if (A->form_explicit_transpose) {
420: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
421: mode = "N";
422: } else {
423: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
424: csrmat = aijkok->csrmat;
425: mode = "T";
426: }
427: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
428: PetscCall(VecRestoreKokkosView(xx, &xv));
429: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
430: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
431: PetscCall(PetscLogGpuTimeEnd());
432: PetscFunctionReturn(PETSC_SUCCESS);
433: }
435: /* y = A^H x */
436: static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
437: {
438: Mat_SeqAIJKokkos *aijkok;
439: const char *mode;
440: ConstPetscScalarKokkosView xv;
441: PetscScalarKokkosView yv;
442: KokkosCsrMatrix csrmat;
444: PetscFunctionBegin;
445: PetscCall(PetscLogGpuTimeBegin());
446: PetscCall(MatSeqAIJKokkosSyncDevice(A));
447: PetscCall(VecGetKokkosView(xx, &xv));
448: PetscCall(VecGetKokkosViewWrite(yy, &yv));
449: if (A->form_explicit_transpose) {
450: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
451: mode = "N";
452: } else {
453: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
454: csrmat = aijkok->csrmat;
455: mode = "C";
456: }
457: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
458: PetscCall(VecRestoreKokkosView(xx, &xv));
459: PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
460: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
461: PetscCall(PetscLogGpuTimeEnd());
462: PetscFunctionReturn(PETSC_SUCCESS);
463: }
465: /* z = A x + y */
466: static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
467: {
468: Mat_SeqAIJKokkos *aijkok;
469: ConstPetscScalarKokkosView xv;
470: PetscScalarKokkosView zv;
472: PetscFunctionBegin;
473: PetscCall(PetscLogGpuTimeBegin());
474: PetscCall(MatSeqAIJKokkosSyncDevice(A));
475: if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
476: PetscCall(VecGetKokkosView(xx, &xv));
477: PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
478: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
479: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
480: PetscCall(VecRestoreKokkosView(xx, &xv));
481: PetscCall(VecRestoreKokkosView(zz, &zv));
482: PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
483: PetscCall(PetscLogGpuTimeEnd());
484: PetscFunctionReturn(PETSC_SUCCESS);
485: }
487: /* z = A^T x + y */
488: static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
489: {
490: Mat_SeqAIJKokkos *aijkok;
491: const char *mode;
492: ConstPetscScalarKokkosView xv;
493: PetscScalarKokkosView zv;
494: KokkosCsrMatrix csrmat;
496: PetscFunctionBegin;
497: PetscCall(PetscLogGpuTimeBegin());
498: PetscCall(MatSeqAIJKokkosSyncDevice(A));
499: if (zz != yy) PetscCall(VecCopy(yy, zz));
500: PetscCall(VecGetKokkosView(xx, &xv));
501: PetscCall(VecGetKokkosView(zz, &zv));
502: if (A->form_explicit_transpose) {
503: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
504: mode = "N";
505: } else {
506: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
507: csrmat = aijkok->csrmat;
508: mode = "T";
509: }
510: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
511: PetscCall(VecRestoreKokkosView(xx, &xv));
512: PetscCall(VecRestoreKokkosView(zz, &zv));
513: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
514: PetscCall(PetscLogGpuTimeEnd());
515: PetscFunctionReturn(PETSC_SUCCESS);
516: }
518: /* z = A^H x + y */
519: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
520: {
521: Mat_SeqAIJKokkos *aijkok;
522: const char *mode;
523: ConstPetscScalarKokkosView xv;
524: PetscScalarKokkosView zv;
525: KokkosCsrMatrix csrmat;
527: PetscFunctionBegin;
528: PetscCall(PetscLogGpuTimeBegin());
529: PetscCall(MatSeqAIJKokkosSyncDevice(A));
530: if (zz != yy) PetscCall(VecCopy(yy, zz));
531: PetscCall(VecGetKokkosView(xx, &xv));
532: PetscCall(VecGetKokkosView(zz, &zv));
533: if (A->form_explicit_transpose) {
534: PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
535: mode = "N";
536: } else {
537: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
538: csrmat = aijkok->csrmat;
539: mode = "C";
540: }
541: PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
542: PetscCall(VecRestoreKokkosView(xx, &xv));
543: PetscCall(VecRestoreKokkosView(zz, &zv));
544: PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
545: PetscCall(PetscLogGpuTimeEnd());
546: PetscFunctionReturn(PETSC_SUCCESS);
547: }
549: static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
550: {
551: Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
553: PetscFunctionBegin;
554: switch (op) {
555: case MAT_FORM_EXPLICIT_TRANSPOSE:
556:     /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
557: if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
558: A->form_explicit_transpose = flg;
559: break;
560: default:
561: PetscCall(MatSetOption_SeqAIJ(A, op, flg));
562: break;
563: }
564: PetscFunctionReturn(PETSC_SUCCESS);
565: }
567: /* Depending on reuse, either build a new mat, or use the existing mat */
568: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
569: {
570: Mat_SeqAIJ *aseq;
572: PetscFunctionBegin;
573: PetscCall(PetscKokkosInitializeCheck());
574: if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
575: PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
576: PetscCall(MatSetType(*newmat, mtype));
577: } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
578: PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
579: } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
580: PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
581: PetscCall(PetscFree(A->defaultvectype));
582: PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
583: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
584: PetscCall(MatSetOps_SeqAIJKokkos(A));
585: aseq = static_cast<Mat_SeqAIJ *>(A->data);
586: if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
587: PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
588: A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
589: }
590: }
591: PetscFunctionReturn(PETSC_SUCCESS);
592: }
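// ---[ Annotation: hedged usage sketch of the conversion above ]-------------
//
//   Mat Ak;
//   PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &Ak)); // new matrix
//   // or convert A itself, taking the MAT_INPLACE_MATRIX branch above:
//   PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
// ---------------------------------------------------------------------------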
594: /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
595:    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
596: */
597: static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
598: {
599: Mat_SeqAIJ *bseq;
600: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
601: Mat mat;
603: PetscFunctionBegin;
604:   /* Do not copy values on host, as A's latest values might be on device. We don't want to sync blindly */
605: PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
606: mat = *B;
607: if (A->assembled) {
608: bseq = static_cast<Mat_SeqAIJ *>(mat->data);
609: bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
610: bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
611: /* Now copy values to B if needed */
612: if (dupOption == MAT_COPY_VALUES) {
613: if (akok->a_dual.need_sync_device()) {
614: Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
615: bkok->a_dual.modify_host();
616: } else { /* If device has the latest data, we only copy data on device */
617: Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
618: bkok->a_dual.modify_device();
619: }
620: } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
621:       /* B's values on host should already be zeroed by MatDuplicate_SeqAIJ() */
622: bkok->a_dual.modify_host();
623: }
624: mat->spptr = bkok;
625: }
627: PetscCall(PetscFree(mat->defaultvectype));
628: PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
629: PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
630: PetscCall(MatSetOps_SeqAIJKokkos(mat));
631: PetscFunctionReturn(PETSC_SUCCESS);
632: }
634: static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
635: {
636: Mat At;
637: KokkosCsrMatrix internT;
638: Mat_SeqAIJKokkos *atkok, *bkok;
640: PetscFunctionBegin;
641: if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
642: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
643: if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
644: /* Deep copy internT, as we want to isolate the internal transpose */
645: PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
646: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
647: if (reuse == MAT_INITIAL_MATRIX) *B = At;
648: else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
649: } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
650: if ((*B)->assembled) {
651: bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
652: PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
653: PetscCall(MatSeqAIJKokkosModifyDevice(*B));
654: } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
655: Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
656: MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
657: MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
658: PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
659: PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
660: } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
661: }
662: PetscFunctionReturn(PETSC_SUCCESS);
663: }
665: static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
666: {
667: Mat_SeqAIJKokkos *aijkok;
669: PetscFunctionBegin;
670: if (A->factortype == MAT_FACTOR_NONE) {
671: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
672: delete aijkok;
673: } else {
674: delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
675: }
676: A->spptr = NULL;
677: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
678: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
679: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
680: #if defined(PETSC_HAVE_HYPRE)
681: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
682: #endif
683: PetscCall(MatDestroy_SeqAIJ(A));
684: PetscFunctionReturn(PETSC_SUCCESS);
685: }
687: /*MC
688: MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
690: A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
692: Options Database Key:
693: . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
695: Level: beginner
697: .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
698: M*/
699: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
700: {
701: PetscFunctionBegin;
702: PetscCall(PetscKokkosInitializeCheck());
703: PetscCall(MatCreate_SeqAIJ(A));
704: PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
705: PetscFunctionReturn(PETSC_SUCCESS);
706: }
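// ---[ Annotation: how this constructor is typically reached ]---------------
//
//   PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); // programmatic selection
//   // or, per the docstring above, from the command line:
//   //   ./app -mat_type aijkokkos
// ---------------------------------------------------------------------------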
708: /* Merge A, B into a matrix C. A is put before B. C has size A->rmap->n by (A->cmap->n + B->cmap->n) */
709: PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
710: {
711: Mat_SeqAIJ *a, *b;
712: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
713: MatScalarKokkosView aa, ba, ca;
714: MatRowMapKokkosView ai, bi, ci;
715: MatColIdxKokkosView aj, bj, cj;
716: PetscInt m, n, nnz, aN;
718: PetscFunctionBegin;
721: PetscAssertPointer(C, 4);
722: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
723: PetscCheckTypeName(B, MATSEQAIJKOKKOS);
724:   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
725: PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
727: PetscCall(MatSeqAIJKokkosSyncDevice(A));
728: PetscCall(MatSeqAIJKokkosSyncDevice(B));
729: a = static_cast<Mat_SeqAIJ *>(A->data);
730: b = static_cast<Mat_SeqAIJ *>(B->data);
731: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
732: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
733: aa = akok->a_dual.view_device();
734: ai = akok->i_dual.view_device();
735: ba = bkok->a_dual.view_device();
736: bi = bkok->i_dual.view_device();
737: m = A->rmap->n; /* M, N and nnz of C */
738: n = A->cmap->n + B->cmap->n;
739: nnz = a->nz + b->nz;
740: aN = A->cmap->n; /* N of A */
741: if (reuse == MAT_INITIAL_MATRIX) {
742: aj = akok->j_dual.view_device();
743: bj = bkok->j_dual.view_device();
744: auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
745: auto ci = MatRowMapKokkosView("i", ai.extent(0));
746: auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
748: /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
749: Kokkos::parallel_for(
750: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
751: PetscInt i = t.league_rank(); /* row i */
752: PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
754: Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
755: ci(i) = coffset;
756: if (i == m - 1) ci(m) = ai(m) + bi(m);
757: });
759: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
760: if (k < alen) {
761: ca(coffset + k) = aa(ai(i) + k);
762: cj(coffset + k) = aj(ai(i) + k);
763: } else {
764: ca(coffset + k) = ba(bi(i) + k - alen);
765: cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
766: }
767: });
768: });
769: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
770: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
771: } else if (reuse == MAT_REUSE_MATRIX) {
773: PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
774: ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
775: ca = ckok->a_dual.view_device();
776: ci = ckok->i_dual.view_device();
778: Kokkos::parallel_for(
779: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
780: PetscInt i = t.league_rank(); /* row i */
781: PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
782: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
783: if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
784: else ca(ci(i) + k) = ba(bi(i) + k - alen);
785: });
786: });
787: PetscCall(MatSeqAIJKokkosModifyDevice(*C));
788: }
789: PetscFunctionReturn(PETSC_SUCCESS);
790: }
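// ---[ Annotation: worked example of the merge above ]-----------------------
// With A = [a00 a01] (1x2) and B = [b00] (1x1), C is the 1x3 matrix
// [a00 a01 b00]: rows of A and B are concatenated row by row, and B's column
// indices are shifted by aN = A->cmap->n.
// ---------------------------------------------------------------------------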
792: static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
793: {
794: PetscFunctionBegin;
795: delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
796: PetscFunctionReturn(PETSC_SUCCESS);
797: }
799: static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
800: {
801: Mat_Product *product = C->product;
802: Mat A, B;
803: bool transA, transB; /* use bool, since KK needs this type */
804: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
805: Mat_SeqAIJ *c;
806: MatProductData_SeqAIJKokkos *pdata;
807: KokkosCsrMatrix csrmatA, csrmatB;
809: PetscFunctionBegin;
810: MatCheckProduct(C, 1);
811: PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
812: pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);
814: // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
815: // If yes, skip the numeric, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
816: // we still do numeric.
817: if (pdata->reusesym) { // numeric reuses results from symbolic
818: pdata->reusesym = PETSC_FALSE;
819: PetscFunctionReturn(PETSC_SUCCESS);
820: }
822: switch (product->type) {
823: case MATPRODUCT_AB:
824: transA = false;
825: transB = false;
826: break;
827: case MATPRODUCT_AtB:
828: transA = true;
829: transB = false;
830: break;
831: case MATPRODUCT_ABt:
832: transA = false;
833: transB = true;
834: break;
835: default:
836: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
837: }
839: A = product->A;
840: B = product->B;
841: PetscCall(MatSeqAIJKokkosSyncDevice(A));
842: PetscCall(MatSeqAIJKokkosSyncDevice(B));
843: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
844: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
845: ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
847: PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
849: csrmatA = akok->csrmat;
850: csrmatB = bkok->csrmat;
852: /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
853: if (transA) {
854: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
855: transA = false;
856: }
858: if (transB) {
859: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
860: transB = false;
861: }
862: PetscCall(PetscLogGpuTimeBegin());
863: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
864: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
865: auto spgemmHandle = pdata->kh.get_spgemm_handle();
866: if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
867: #endif
869: PetscCall(PetscLogGpuTimeEnd());
870: PetscCall(MatSeqAIJKokkosModifyDevice(C));
871: /* shorter version of MatAssemblyEnd_SeqAIJ */
872: c = (Mat_SeqAIJ *)C->data;
873: PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
874: PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
875: PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
876: c->reallocs = 0;
877: C->info.mallocs = 0;
878: C->info.nz_unneeded = 0;
879: C->assembled = C->was_assembled = PETSC_TRUE;
880: C->num_ass++;
881: PetscFunctionReturn(PETSC_SUCCESS);
882: }
884: static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
885: {
886: Mat_Product *product = C->product;
887: MatProductType ptype;
888: Mat A, B;
889: bool transA, transB;
890: Mat_SeqAIJKokkos *akok, *bkok, *ckok;
891: MatProductData_SeqAIJKokkos *pdata;
892: MPI_Comm comm;
893: KokkosCsrMatrix csrmatA, csrmatB, csrmatC;
895: PetscFunctionBegin;
896: MatCheckProduct(C, 1);
897: PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
898: PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
899: A = product->A;
900: B = product->B;
901: PetscCall(MatSeqAIJKokkosSyncDevice(A));
902: PetscCall(MatSeqAIJKokkosSyncDevice(B));
903: akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
904: bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
905: csrmatA = akok->csrmat;
906: csrmatB = bkok->csrmat;
908: ptype = product->type;
909: // Take advantage of the symmetry if true
910: if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
911: ptype = MATPRODUCT_AB;
912: product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
913: }
914: if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
915: ptype = MATPRODUCT_AB;
916: product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
917: }
919: switch (ptype) {
920: case MATPRODUCT_AB:
921: transA = false;
922: transB = false;
923: PetscCall(MatSetBlockSizesFromMats(C, A, B));
924: break;
925: case MATPRODUCT_AtB:
926: transA = true;
927: transB = false;
928: if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
929: if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
930: break;
931: case MATPRODUCT_ABt:
932: transA = false;
933: transB = true;
934: if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
935: if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
936: break;
937: default:
938: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
939: }
940: PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
941: pdata->reusesym = product->api_user;
943: /* TODO: add command line options to select spgemm algorithms */
944: auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
946: /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
947: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
948: #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
949: spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
950: #endif
951: #endif
952: PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
954: PetscCall(PetscLogGpuTimeBegin());
955: /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
956: if (transA) {
957: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
958: transA = false;
959: }
961: if (transB) {
962: PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
963: transB = false;
964: }
966: PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
967: /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
968:    So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
969:    calling new Mat_SeqAIJKokkos().
970:    TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
971: */
972: PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
973: #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
974: /* Query if KK outputs a sorted matrix. If not, we need to sort it */
975: auto spgemmHandle = pdata->kh.get_spgemm_handle();
976:   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
977: #endif
978: PetscCall(PetscLogGpuTimeEnd());
980: PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
981: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
982: C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
983: PetscFunctionReturn(PETSC_SUCCESS);
984: }
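// ---[ Annotation: the two-phase spgemm idiom used above, condensed ]--------
// A sketch using the same KokkosKernels calls as this file:
//
//   pdata->kh.create_spgemm_handle(spgemm_alg);                      // pick algorithm
//   KokkosSparse::spgemm_symbolic(pdata->kh, A, false, B, false, C); // C's row map
//   KokkosSparse::spgemm_numeric(pdata->kh, A, false, B, false, C);  // C's entries/values
//
// The handle caches the symbolic analysis, so repeated numeric calls (the
// MAT_REUSE_MATRIX path) skip the symbolic work.
// ---------------------------------------------------------------------------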
986: /* Handles sparse matrix-matrix products */
987: static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
988: {
989: Mat_Product *product = mat->product;
990: PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
992: PetscFunctionBegin;
993: MatCheckProduct(mat, 1);
994: PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
995: if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
996: if (Biskok && Ciskok) {
997: switch (product->type) {
998: case MATPRODUCT_AB:
999: case MATPRODUCT_AtB:
1000: case MATPRODUCT_ABt:
1001: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1002: break;
1003: case MATPRODUCT_PtAP:
1004: case MATPRODUCT_RARt:
1005: case MATPRODUCT_ABC:
1006: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1007: break;
1008: default:
1009: break;
1010: }
1011: } else { /* fallback for AIJ */
1012: PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1013: }
1014: PetscFunctionReturn(PETSC_SUCCESS);
1015: }
1017: static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1018: {
1019: Mat_SeqAIJKokkos *aijkok;
1021: PetscFunctionBegin;
1022: PetscCall(PetscLogGpuTimeBegin());
1023: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1024: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1025: KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1026: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1027: PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1028: PetscCall(PetscLogGpuTimeEnd());
1029: PetscFunctionReturn(PETSC_SUCCESS);
1030: }
1032: // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
1033: static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1034: {
1035: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1037: PetscFunctionBegin;
1038: if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
1039: PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1041: PetscCall(PetscLogGpuTimeBegin());
1042: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1043: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1044: const auto &Aa = aijkok->a_dual.view_device();
1045: const auto &Adiag = aijkok->diag_dual.view_device();
1046: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1047: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1048: PetscCall(PetscLogGpuFlops(n));
1049: PetscCall(PetscLogGpuTimeEnd());
1050: } else { // need reassembly, very slow!
1051: PetscCall(MatShift_Basic(A, a));
1052: }
1053: PetscFunctionReturn(PETSC_SUCCESS);
1054: }
1056: static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1057: {
1058: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1060: PetscFunctionBegin;
1061: if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
1062: ConstPetscScalarKokkosView dv;
1063: PetscInt n, nv;
1065: PetscCall(PetscLogGpuTimeBegin());
1066: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1067: PetscCall(VecGetKokkosView(D, &dv));
1068: PetscCall(VecGetLocalSize(D, &nv));
1069: n = PetscMin(Y->rmap->n, Y->cmap->n);
1070: PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1072: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1073: const auto &Aa = aijkok->a_dual.view_device();
1074: const auto &Adiag = aijkok->diag_dual.view_device();
1075: PetscCallCXX(Kokkos::parallel_for(
1076: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1077: if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1078: else Aa(Adiag(i)) += dv(i);
1079: }));
1080: PetscCall(VecRestoreKokkosView(D, &dv));
1081: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1082: PetscCall(PetscLogGpuFlops(n));
1083: PetscCall(PetscLogGpuTimeEnd());
1084: } else { // need reassembly, very slow!
1085: PetscCall(MatDiagonalSet_Default(Y, D, is));
1086: }
1087: PetscFunctionReturn(PETSC_SUCCESS);
1088: }
1090: static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1091: {
1092: Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1093: PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1094: ConstPetscScalarKokkosView lv, rv;
1096: PetscFunctionBegin;
1097: PetscCall(PetscLogGpuTimeBegin());
1098: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1099: const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1100: const auto &Aa = aijkok->a_dual.view_device();
1101: const auto &Ai = aijkok->i_dual.view_device();
1102: const auto &Aj = aijkok->j_dual.view_device();
1103: if (ll) {
1104: PetscCall(VecGetLocalSize(ll, &m));
1105: PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1106: PetscCall(VecGetKokkosView(ll, &lv));
1107: PetscCallCXX(Kokkos::parallel_for( // for each row
1108: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1109: PetscInt i = t.league_rank(); // row i
1110: PetscInt len = Ai(i + 1) - Ai(i);
1111: // scale entries on the row
1112: Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1113: }));
1114: PetscCall(VecRestoreKokkosView(ll, &lv));
1115: PetscCall(PetscLogGpuFlops(nz));
1116: }
1117: if (rr) {
1118: PetscCall(VecGetLocalSize(rr, &n));
1119: PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1120: PetscCall(VecGetKokkosView(rr, &rv));
1121: PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1122: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1123:     PetscCall(VecRestoreKokkosView(rr, &rv)); /* restore the view of rr */
1124: PetscCall(PetscLogGpuFlops(nz));
1125: }
1126: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1127: PetscCall(PetscLogGpuTimeEnd());
1128: PetscFunctionReturn(PETSC_SUCCESS);
1129: }
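// ---[ Annotation: what MatDiagonalScale computes ]--------------------------
// The kernel above performs A <- diag(ll) * A * diag(rr) in CSR form: left
// scaling multiplies every entry of row i by lv(i) (one team per row), and
// right scaling multiplies entry a_ij by rv(j), i.e. Aa(k) *= rv(Aj(k)) over
// all nz stored entries.
// ---------------------------------------------------------------------------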
1131: static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1132: {
1133: Mat_SeqAIJKokkos *aijkok;
1135: PetscFunctionBegin;
1136: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1137: if (aijkok) { /* Only zero the device if data is already there */
1138: KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1139: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1140: } else { /* Might be preallocated but not assembled */
1141: PetscCall(MatZeroEntries_SeqAIJ(A));
1142: }
1143: PetscFunctionReturn(PETSC_SUCCESS);
1144: }
1146: static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1147: {
1148: Mat_SeqAIJKokkos *aijkok;
1149: PetscInt n;
1150: PetscScalarKokkosView xv;
1152: PetscFunctionBegin;
1153: PetscCall(VecGetLocalSize(x, &n));
1154: PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1155: PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1157: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1158: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1160: const auto &Aa = aijkok->a_dual.view_device();
1161: const auto &Ai = aijkok->i_dual.view_device();
1162: const auto &Adiag = aijkok->diag_dual.view_device();
1164: PetscCall(VecGetKokkosViewWrite(x, &xv));
1165: Kokkos::parallel_for(
1166: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1167: if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1168: else xv(i) = 0;
1169: });
1170: PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1171: PetscFunctionReturn(PETSC_SUCCESS);
1172: }
1174: /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
1175: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1176: {
1177: Mat_SeqAIJKokkos *aijkok;
1179: PetscFunctionBegin;
1181: PetscAssertPointer(kv, 2);
1182: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1183: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1184: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1185: *kv = aijkok->a_dual.view_device();
1186: PetscFunctionReturn(PETSC_SUCCESS);
1187: }
1189: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1190: {
1191: PetscFunctionBegin;
1193: PetscAssertPointer(kv, 2);
1194: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1195: PetscFunctionReturn(PETSC_SUCCESS);
1196: }
1198: PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1199: {
1200: Mat_SeqAIJKokkos *aijkok;
1202: PetscFunctionBegin;
1204: PetscAssertPointer(kv, 2);
1205: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1206: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1207: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1208: *kv = aijkok->a_dual.view_device();
1209: PetscFunctionReturn(PETSC_SUCCESS);
1210: }
1212: PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1213: {
1214: PetscFunctionBegin;
1216: PetscAssertPointer(kv, 2);
1217: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1218: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1219: PetscFunctionReturn(PETSC_SUCCESS);
1220: }
1222: PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1223: {
1224: Mat_SeqAIJKokkos *aijkok;
1226: PetscFunctionBegin;
1228: PetscAssertPointer(kv, 2);
1229: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1230: aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1231: *kv = aijkok->a_dual.view_device();
1232: PetscFunctionReturn(PETSC_SUCCESS);
1233: }
1235: PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1236: {
1237: PetscFunctionBegin;
1239: PetscAssertPointer(kv, 2);
1240: PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1241: PetscCall(MatSeqAIJKokkosModifyDevice(A));
1242: PetscFunctionReturn(PETSC_SUCCESS);
1243: }
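// ---[ Annotation: hedged usage sketch of the view accessors above ]---------
// Work on the device values directly; the Restore call records the write.
//
//   MatScalarKokkosView av;
//   PetscCall(MatSeqAIJGetKokkosView(A, &av)); // syncs and returns device view
//   PetscCallCXX(Kokkos::parallel_for(
//     av.extent(0), KOKKOS_LAMBDA(const size_t k) { av(k) *= 2.0; }));
//   PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); // marks device data modified
// ---------------------------------------------------------------------------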
1245: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246: {
1247: Mat_SeqAIJKokkos *akok;
1249: PetscFunctionBegin;
1250: PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1251: PetscCall(MatCreate(comm, A));
1252: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1253: PetscFunctionReturn(PETSC_SUCCESS);
1254: }
1256: /* Computes Y += alpha X */
1257: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1258: {
1259: Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1260: Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1261: ConstMatScalarKokkosView Xa;
1262: MatScalarKokkosView Ya;
1263: auto exec = PetscGetKokkosExecutionSpace();
1265: PetscFunctionBegin;
1266: PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1267: PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1268: PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1269: PetscCall(MatSeqAIJKokkosSyncDevice(X));
1270: PetscCall(PetscLogGpuTimeBegin());
1272: if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1273: PetscBool e;
1274: PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1275: if (e) {
1276: PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1277: if (e) pattern = SAME_NONZERO_PATTERN;
1278: }
1279: }
1281:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
1282:      cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1283:      If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1284:      KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, whereas MatAXPY does not.
1285: */
1286: ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1287: xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1288: Xa = xkok->a_dual.view_device();
1289: Ya = ykok->a_dual.view_device();
1291: if (pattern == SAME_NONZERO_PATTERN) {
1292: KokkosBlas::axpy(exec, alpha, Xa, Ya);
1293: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1294: } else if (pattern == SUBSET_NONZERO_PATTERN) {
1295: MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1296: MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1298: Kokkos::parallel_for(
1299: Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1300: PetscInt i = t.league_rank(); // row i
1301: Kokkos::single(Kokkos::PerTeam(t), [=]() {
1302: // Only one thread works in a team
1303: PetscInt p, q = Yi(i);
1304: for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1305: while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1306: if (Xj(p) == Yj(q)) { // Found it
1307: Ya(q) += alpha * Xa(p);
1308: q++;
1309: } else {
1310: // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1311: // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1312: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1313: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1314: #else
1315: if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1316: #endif
1317: }
1318: }
1319: });
1320: });
1321: PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1322: } else { // different nonzero patterns
1323: Mat Z;
1324: KokkosCsrMatrix zcsr;
1325: KernelHandle kh;
1326: kh.create_spadd_handle(true); // X, Y are sorted
1327: KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1328: KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1329: zkok = new Mat_SeqAIJKokkos(zcsr);
1330: PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1331: PetscCall(MatHeaderReplace(Y, &Z));
1332: kh.destroy_spadd_handle();
1333: }
1334: PetscCall(PetscLogGpuTimeEnd());
1335: PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1336: PetscFunctionReturn(PETSC_SUCCESS);
1337: }
1339: struct MatCOOStruct_SeqAIJKokkos {
1340: PetscCount n;
1341: PetscCount Atot;
1342: PetscInt nz;
1343: PetscCountKokkosView jmap;
1344: PetscCountKokkosView perm;
1346: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1347: {
1348: nz = coo_h->nz;
1349: n = coo_h->n;
1350: Atot = coo_h->Atot;
1351: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1352: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1353: }
1354: };
1356: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1357: {
1358: PetscFunctionBegin;
1359: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1360: PetscFunctionReturn(PETSC_SUCCESS);
1361: }
1363: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1364: {
1365: Mat_SeqAIJKokkos *akok;
1366: Mat_SeqAIJ *aseq;
1367: PetscContainer container_h;
1368: MatCOOStruct_SeqAIJ *coo_h;
1369: MatCOOStruct_SeqAIJKokkos *coo_d;
1371: PetscFunctionBegin;
1372: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1373: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1374: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1375: delete akok;
1376: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1377: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1379: // Copy the COO struct to device
1380: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1381: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1382: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1384: // Put the COO struct in a container and then attach that to the matrix
1385: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1386: PetscFunctionReturn(PETSC_SUCCESS);
1387: }
1389: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1390: {
1391: MatScalarKokkosView Aa;
1392: ConstMatScalarKokkosView kv;
1393: PetscMemType memtype;
1394: PetscContainer container;
1395: MatCOOStruct_SeqAIJKokkos *coo;
1397: PetscFunctionBegin;
1398: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1399: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1401: const auto &n = coo->n;
1402: const auto &Annz = coo->nz;
1403: const auto &jmap = coo->jmap;
1404: const auto &perm = coo->perm;
1406: PetscCall(PetscGetMemType(v, &memtype));
1407:   if (PetscMemTypeHost(memtype)) { /* If the user provided v[] on host, we need to copy it to device */
1408: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1409: } else {
1410: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1411: }
1413: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1414: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1416: PetscCall(PetscLogGpuTimeBegin());
1417: Kokkos::parallel_for(
1418: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1419: PetscScalar sum = 0.0;
1420: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1421: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1422: });
1423: PetscCall(PetscLogGpuTimeEnd());
1425: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1426: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1427: PetscFunctionReturn(PETSC_SUCCESS);
1428: }
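/* A minimal usage sketch added for illustration (not part of the original source; assumes A is a
   2x2 MATSEQAIJKOKKOS matrix with its sizes already set; error handling elided):

     PetscInt    i[] = {0, 0, 1};       // row indices of the COO entries
     PetscInt    j[] = {0, 1, 1};       // column indices of the COO entries
     PetscScalar v[] = {1.0, 2.0, 3.0}; // v[] may live in host or device memory
     PetscCall(MatSetPreallocationCOO(A, 3, i, j));   // builds jmap/perm and copies them to device
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // sums duplicates and writes A on device

   Repeated entries with the same (i,j) are accumulated by the jmap/perm gather in the kernel above.
*/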
1430: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1431: {
1432: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1434: PetscFunctionBegin;
1435: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1436: A->boundtocpu = PETSC_FALSE;
1438: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1439: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1440: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1441: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1442: A->ops->scale = MatScale_SeqAIJKokkos;
1443: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1444: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1445: A->ops->mult = MatMult_SeqAIJKokkos;
1446: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1447: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1448: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1449: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1450: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1451: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1452: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1453: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1454: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1455: A->ops->shift = MatShift_SeqAIJKokkos;
1456: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1457: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1458: A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos;
1459: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1460: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1461: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1462: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1463: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1464: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1465: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1467: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1468: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1469: #if defined(PETSC_HAVE_HYPRE)
1470: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1471: #endif
1472: PetscFunctionReturn(PETSC_SUCCESS);
1473: }
1475: /*
1476:   Extract the (prescribed) diagonal blocks of the matrix and then invert them
1478: Input Parameters:
1479: + A - the MATSEQAIJKOKKOS matrix
1480: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1481: . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1482: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1483: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1485: Output Parameter:
1486: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1487: */
1488: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1489: {
1490: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1491: PetscInt nblocks = bs.extent(0) - 1;
1493: PetscFunctionBegin;
1494: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1496: // Pull out the diagonal blocks of the matrix and then invert the blocks
1497: auto Aa = akok->a_dual.view_device();
1498: auto Ai = akok->i_dual.view_device();
1499: auto Aj = akok->j_dual.view_device();
1500: auto Adiag = akok->diag_dual.view_device();
1501: // TODO: how to tune the team size?
1502: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1503: auto ts = Kokkos::AUTO();
1504: #else
1505:   auto ts = 16; // improves performance by ~30% over Kokkos::AUTO() with CUDA, but fails with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1506: #endif
1507: PetscCallCXX(Kokkos::parallel_for(
1508: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1509: const PetscInt bid = teamMember.league_rank(); // block id
1510: const PetscInt rstart = bs(bid); // this block starts from this row
1511: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1512: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1513: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1515: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1516: PetscInt i = rstart + r; // i-th row in A
1518: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1519: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1521:             for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1522: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1523: B(r, c) = 0.0;
1524: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1525: B(r, c) = Aa(first + c);
1526: } else { // this entry does not show up in the CSR
1527: B(r, c) = 0.0;
1528: }
1529: }
1530: } else { // rare case that the diagonal does not exist
1531: const PetscInt begin = Ai(i);
1532: const PetscInt end = Ai(i + 1);
1533: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1534: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1535: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1536: else if (Aj(j) >= rstart + m) break;
1537: }
1538: }
1539: });
1541: // LU-decompose B (w/o pivoting) and then invert B
1542: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1543: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1544: }));
1545: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1546: PetscFunctionReturn(PETSC_SUCCESS);
1547: }
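/* A worked example added for illustration (not part of the original source) of the block encoding
   this routine expects. For a 5x5 matrix with two diagonal blocks of sizes 2 and 3:

     bs     = {0, 2, 5}        // block row offsets in 'csr' format
     bs2    = {0, 4, 13}       // value offsets: 2*2 = 4, then 4 + 3*3 = 13
     blkMap = {0, 0, 1, 1, 1}  // rows 0-1 belong to block 0, rows 2-4 to block 1

   diagVal and work must each hold bs2(nblocks) = 13 scalars; the inverted blocks are returned in
   diagVal, each stored in column-major order at offset bs2(bid).
*/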
1549: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1550: {
1551: Mat_SeqAIJ *aseq;
1552: PetscInt i, m, n;
1553: auto exec = PetscGetKokkosExecutionSpace();
1555: PetscFunctionBegin;
1556: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1558: m = akok->nrows();
1559: n = akok->ncols();
1560: PetscCall(MatSetSizes(A, m, n, m, n));
1561: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1563: /* Set up data structures of A as a MATSEQAIJ */
1564: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1565: aseq = (Mat_SeqAIJ *)A->data;
1567: PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1568: PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1570: aseq->i = akok->i_host_data();
1571: aseq->j = akok->j_host_data();
1572: aseq->a = akok->a_host_data();
1573:   aseq->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1574: aseq->free_a = PETSC_FALSE;
1575: aseq->free_ij = PETSC_FALSE;
1576: aseq->nz = akok->nnz();
1577: aseq->maxnz = aseq->nz;
1579: PetscCall(PetscMalloc1(m, &aseq->imax));
1580: PetscCall(PetscMalloc1(m, &aseq->ilen));
1581: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1583: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1584: akok->nonzerostate = A->nonzerostate;
1585: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1586: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1587: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1588: PetscFunctionReturn(PETSC_SUCCESS);
1589: }
1591: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1592: {
1593: PetscFunctionBegin;
1594: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1596: PetscFunctionReturn(PETSC_SUCCESS);
1597: }
1599: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1600: {
1601: Mat_SeqAIJKokkos *akok;
1603: PetscFunctionBegin;
1604: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1605: PetscCall(MatCreate(comm, A));
1606: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1607: PetscFunctionReturn(PETSC_SUCCESS);
1608: }
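/* A minimal sketch added for illustration (not part of the original source). It assumes the usual
   KokkosSparse::CrsMatrix constructor taking (label, nrows, ncols, nnz, values, rowmap, entries),
   and device views i_d/j_d/a_d produced elsewhere:

     MatRowMapKokkosView i_d;  // row offsets, length m+1, on device
     MatColIdxKokkosView j_d;  // column indices, length nnz, on device
     MatScalarKokkosView a_d;  // values, length nnz, on device
     KokkosCsrMatrix     csr("csr", m, n, nnz, a_d, i_d, j_d);
     Mat                 A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosCsrMatrix(PETSC_COMM_SELF, csr, &A));
*/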
1610: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1612: Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1613: */
1614: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1615: {
1616: PetscFunctionBegin;
1617: PetscCall(MatCreate(comm, A));
1618: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619: PetscFunctionReturn(PETSC_SUCCESS);
1620: }
1622: /*@C
1623:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1624:   (the sequential Kokkos version of the default PETSc format). This matrix will ultimately be handled by
1625:   Kokkos for calculations.
1627: Collective
1629: Input Parameters:
1630: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1631: . m - number of rows
1632: . n - number of columns
1633: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1634: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1636: Output Parameter:
1637: . A - the matrix
1639: Level: intermediate
1641: Notes:
1642: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1643:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1644: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1646: The AIJ format, also called
1647: compressed row storage, is fully compatible with standard Fortran
1648: storage. That is, the stored row and column indices can begin at
1649: either one (as in Fortran) or zero.
1651: Specify the preallocated storage with either `nz` or `nnz` (not both).
1652: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1653: allocation.
1655: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1656: @*/
1657: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1658: {
1659: PetscFunctionBegin;
1660: PetscCall(PetscKokkosInitializeCheck());
1661: PetscCall(MatCreate(comm, A));
1662: PetscCall(MatSetSizes(*A, m, n, m, n));
1663: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1664: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1665: PetscFunctionReturn(PETSC_SUCCESS);
1666: }
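/* A minimal usage sketch added for illustration (not part of the original source) of the
   recommended paradigm from the Notes above; m, n, nz are placeholders for the caller's sizes:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, n, m, n));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); // or -mat_type aijkokkos with MatSetFromOptions()
     PetscCall(MatSeqAIJSetPreallocation(A, nz, NULL));
*/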
1668: // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1669: // For example, for transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solve directly (KK does not, while cusparse does).
1670: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1671: // Similarly, in KK, sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1672: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1673: {
1674: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1676:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky
1678: PetscFunctionBegin;
1679: if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1680: if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1681: if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1682: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1683: }
1684: PetscFunctionReturn(PETSC_SUCCESS);
1685: }
1687: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1688: {
1689: const PetscInt n = A->rmap->n;
1690: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1692:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky
1694: PetscFunctionBegin;
1695: if (!factors->transpose_updated) {
1696: if (has_upper) {
1697: if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
1698: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1699: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1700: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1701: }
1703: if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1704: if (!factors->U) {
1705: Mat_SeqAIJ *seq;
1707: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1708: PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1710: seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1711: factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1712: factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1713: factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1714: } else {
1715:         PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1716: }
1717: // Copy Ut from host to device
1718: PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1719: PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1720: PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1721: } else { // If U was computed on device, we also compute the transpose there
1722:       // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices. We have to sort the indices until KK provides finer control options.
1723: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1724: factors->jU_d, factors->aU_d,
1725: factors->iUt_d, factors->jUt_d,
1726: factors->aUt_d));
1727: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1728: }
1729: PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730: }
1732: // do the same for L with LU
1733: if (has_lower) {
1734: if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
1735: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1736: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1737: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1738: }
1740: if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1741: if (!factors->L) {
1742: Mat_SeqAIJ *seq;
1744: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1745: PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1747: seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1748: factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1749: factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1750: factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1751: } else {
1752:         PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1753: }
1754: // Copy Lt from host to device
1755: PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1756: PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1757: PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1758: } else { // If L was computed on device, we also compute the transpose there
1759:       // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices. We have to sort the indices until KK provides finer control options.
1760: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1761: factors->jL_d, factors->aL_d,
1762: factors->iLt_d, factors->jLt_d,
1763: factors->aLt_d));
1764: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1765: }
1766: PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767: }
1769: factors->transpose_updated = PETSC_TRUE;
1770: }
1771: PetscFunctionReturn(PETSC_SUCCESS);
1772: }
1774: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and col) permutation matrix on A.
1775: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
1776: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1777: {
1778: auto exec = PetscGetKokkosExecutionSpace();
1779: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1780: PetscInt m = A->rmap->n;
1781: PetscScalarKokkosView D = factors->D_d;
1782: PetscScalarKokkosView X, Y, B; // alias
1783: ConstPetscScalarKokkosView b;
1784: PetscScalarKokkosView x;
1785: PetscIntKokkosView &rowperm = factors->rowperm;
1786: PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1788: PetscFunctionBegin;
1789: PetscCall(PetscLogGpuTimeBegin());
1790: PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T
1791: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1792: PetscCall(VecGetKokkosView(bb, &b));
1793: PetscCall(VecGetKokkosViewWrite(xx, &x));
1795: // Solve U^T Y = B
1796:   if (identity) { // No reordering needed; alias b directly
1797: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1798: Y = factors->workVector;
1799: } else {
1800: B = factors->workVector;
1801:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); // Reorder b with the row permutation
1802: Y = x;
1803: }
1804: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1806: // Solve diag(D) Y' = Y.
1807: // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1808: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1810: // Solve UX = Y
1811: if (identity) {
1812: X = x;
1813: } else {
1814: X = factors->workVector; // B is not needed anymore
1815: }
1816: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1818: // Reorder X with the inverse column (row) permutation
1819: if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1821: PetscCall(VecRestoreKokkosView(bb, &b));
1822: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1823: PetscCall(PetscLogGpuTimeEnd());
1824: PetscFunctionReturn(PETSC_SUCCESS);
1825: }
1827: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1828: // R and C are represented by rowperm and colperm in factors.
1829: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1830: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1831: {
1832: auto exec = PetscGetKokkosExecutionSpace();
1833: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1834: PetscInt m = A->rmap->n;
1835: PetscScalarKokkosView X, Y, B; // alias
1836: ConstPetscScalarKokkosView b;
1837: PetscScalarKokkosView x;
1838: PetscIntKokkosView &rowperm = factors->rowperm;
1839: PetscIntKokkosView &colperm = factors->colperm;
1840: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1841: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1843: PetscFunctionBegin;
1844: PetscCall(PetscLogGpuTimeBegin());
1845: PetscCall(MatSeqAIJKokkosSolveCheck(A));
1846: PetscCall(VecGetKokkosView(bb, &b));
1847: PetscCall(VecGetKokkosViewWrite(xx, &x));
1849: // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
1850: if (row_identity) {
1851: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1852: Y = factors->workVector;
1853: } else {
1854: B = factors->workVector;
1855: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1856: Y = x;
1857: }
1858: PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1860: // Solve U C^- x = Y
1861: if (col_identity) {
1862: X = x;
1863: } else {
1864: X = factors->workVector;
1865: }
1866: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1868: // x = C X; Reorder X with the inverse col permutation
1869: if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1871: PetscCall(VecRestoreKokkosView(bb, &b));
1872: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1873: PetscCall(PetscLogGpuTimeEnd());
1874: PetscFunctionReturn(PETSC_SUCCESS);
1875: }
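/* The algebra behind the two solves above, spelled out for illustration (not part of the original
   source). With R A C = L U, i.e., A = R^-1 L U C^-1, the system A x = b becomes L U C^-1 x = R b,
   which the routine carries out as:

     L Y = R b   // first sptrsv_solve(), with b row-permuted if needed
     U X = Y     // second sptrsv_solve()
     x = C X     // undo the column permutation
*/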
1877: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1878: // R and C are represented by rowperm and colperm in factors.
1879: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1880: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1881: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1882: {
1883: auto exec = PetscGetKokkosExecutionSpace();
1884: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1885: PetscInt m = A->rmap->n;
1886: PetscScalarKokkosView X, Y, B; // alias
1887: ConstPetscScalarKokkosView b;
1888: PetscScalarKokkosView x;
1889: PetscIntKokkosView &rowperm = factors->rowperm;
1890: PetscIntKokkosView &colperm = factors->colperm;
1891: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1894: PetscFunctionBegin;
1895: PetscCall(PetscLogGpuTimeBegin());
1896: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1897: PetscCall(VecGetKokkosView(bb, &b));
1898: PetscCall(VecGetKokkosViewWrite(xx, &x));
1900: // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
1901:   if (col_identity) { // No reordering needed; alias b directly
1902: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1903: Y = factors->workVector;
1904: } else {
1905: B = factors->workVector;
1906:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); // Reorder b with the col permutation
1907: Y = x;
1908: }
1909: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1911: // Solve L^T X = Y
1912: if (row_identity) {
1913: X = x;
1914: } else {
1915: X = factors->workVector;
1916: }
1917: PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1919: // x = R^- X = R^T X; Reorder X with the inverse row permutation
1920: if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1922: PetscCall(VecRestoreKokkosView(bb, &b));
1923: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1924: PetscCall(PetscLogGpuTimeEnd());
1925: PetscFunctionReturn(PETSC_SUCCESS);
1926: }
1928: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1929: {
1930: PetscFunctionBegin;
1931: PetscCall(MatSeqAIJKokkosSyncHost(A));
1932: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1934: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1935: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1936: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1937: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1938: const MatScalar *Ba = b->a;
1939: PetscInt m = B->rmap->n, n = B->cmap->n;
1941: if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1942: // Allocate memory and copy the structure
1943: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1944: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1945: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1946: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1947: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1948: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1950: PetscInt *Li = factors->iL_h.data();
1951: PetscInt *Lj = factors->jL_h.data();
1952: PetscInt *Ui = factors->iU_h.data();
1953: PetscInt *Uj = factors->jU_h.data();
1955: Li[0] = Ui[0] = 0;
1956: for (PetscInt i = 0; i < m; i++) {
1957: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1958: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1960: PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1961: Lj[Li[i] + llen] = i; // diagonal entry of L
1963: Uj[Ui[i]] = i; // diagonal entry of U
1964: PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
1966: Li[i + 1] = Li[i] + llen + 1;
1967: Ui[i + 1] = Ui[i] + ulen;
1968: }
1970: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1971: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1972: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1973: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1974: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1975: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1977: // Copy row/col permutation to device
1978: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1979: PetscBool row_identity;
1980: PetscCall(ISIdentity(rowperm, &row_identity));
1981: if (!row_identity) {
1982: const PetscInt *ip;
1984: PetscCall(ISGetIndices(rowperm, &ip));
1985: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1986: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1987: PetscCall(ISRestoreIndices(rowperm, &ip));
1988: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1989: }
1991: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
1992: PetscBool col_identity;
1993: PetscCall(ISIdentity(colperm, &col_identity));
1994: if (!col_identity) {
1995: const PetscInt *ip;
1997: PetscCall(ISGetIndices(colperm, &ip));
1998: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1999: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2000: PetscCall(ISRestoreIndices(colperm, &ip));
2001: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2002: }
2004: /* Create sptrsv handles for L, U and their transpose */
2005: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2006: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2007: #else
2008: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2009: #endif
2010: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2011: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2012: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2013: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2014: }
2016:     // Copy the values
2017: for (PetscInt i = 0; i < m; i++) {
2018: PetscInt llen = Bi[i + 1] - Bi[i];
2019: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2020: const PetscInt *Li = factors->iL_h.data();
2021: const PetscInt *Ui = factors->iU_h.data();
2023: PetscScalar *La = factors->aL_h.data();
2024: PetscScalar *Ua = factors->aU_h.data();
2026: PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2027: La[Li[i] + llen] = 1.0; // diagonal entry
2029: Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry
2030: PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2031: }
2033: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2034: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2035: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2036: factors->transpose_updated = PETSC_FALSE;
2037: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2039: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2040: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2041: }
2043: B->ops->matsolve = NULL;
2044: B->ops->matsolvetranspose = NULL;
2045: PetscFunctionReturn(PETSC_SUCCESS);
2046: }
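/* A worked summary added for illustration (not part of the original source) of the packed factor
   storage unpacked above. MatLUFactorNumeric_SeqAIJ() stores L (strictly lower part; unit diagonal
   implied) in Bi/Bj/Ba, and U backwards through Bdiag, with U's diagonal kept inverted. For row i:

     llen = Bi[i+1] - Bi[i];        // # of L entries left of the diagonal
     ulen = Bdiag[i] - Bdiag[i+1];  // # of U entries, including the diagonal

   The loops above expand this into the plain CSR arrays {i,j,a}L_h / {i,j,a}U_h that the KK sptrsv
   kernels expect, adding L's unit diagonal explicitly and recovering U's diagonal from its stored
   inverse via Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]].
*/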
2048: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2049: {
2050: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2051: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2052: PetscInt fill_lev = info->levels;
2054: PetscFunctionBegin;
2055: PetscCall(PetscLogGpuTimeBegin());
2056: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2057: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2059: auto a_d = aijkok->a_dual.view_device();
2060: auto i_d = aijkok->i_dual.view_device();
2061: auto j_d = aijkok->j_dual.view_device();
2063: PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2065: B->assembled = PETSC_TRUE;
2066: B->preallocated = PETSC_TRUE;
2067: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2068: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2069: B->ops->matsolve = NULL;
2070: B->ops->matsolvetranspose = NULL;
2072: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2073: factors->transpose_updated = PETSC_FALSE;
2074: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2075: /* TODO: log flops, but how to know that? */
2076: PetscCall(PetscLogGpuTimeEnd());
2077: PetscFunctionReturn(PETSC_SUCCESS);
2078: }
2080: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2081: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2082: {
2083: Mat_SeqAIJKokkos *aijkok;
2084: Mat_SeqAIJ *b;
2085: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2086: PetscInt fill_lev = info->levels;
2087: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2088: PetscInt n = A->rmap->n;
2090: PetscFunctionBegin;
2091: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2092: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2094: /* Create a spiluk handle and then do symbolic factorization */
2095: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2096: factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2098: auto spiluk_handle = factors->kh.get_spiluk_handle();
2100: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2101: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2102: Kokkos::realloc(factors->iU_d, n + 1);
2103: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2105: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2106: auto i_d = aijkok->i_dual.view_device();
2107: auto j_d = aijkok->j_dual.view_device();
2108: PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2109: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2111: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2112: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2113: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2114: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2116: /* TODO: add options to select sptrsv algorithms */
2117: /* Create sptrsv handles for L, U and their transpose */
2118: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2119: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2120: #else
2121: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2122: #endif
2124: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2125: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2126: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2127: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2129: /* Fill fields of the factor matrix B */
2130: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2131: b = (Mat_SeqAIJ *)B->data;
2132: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2133: B->info.fill_ratio_given = info->fill;
2134: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2136: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2137: PetscFunctionReturn(PETSC_SUCCESS);
2138: }
2140: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2141: {
2142: PetscFunctionBegin;
2143: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2144: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2145: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2146: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2147: PetscFunctionReturn(PETSC_SUCCESS);
2148: }
2150: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2151: {
2152: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2154: PetscFunctionBegin;
2155: if (!info->factoronhost) {
2156: PetscCall(ISIdentity(isrow, &row_identity));
2157: PetscCall(ISIdentity(iscol, &col_identity));
2158: }
2160: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2161: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2163: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2164: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2165: } else {
2166: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2167: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2168: }
2169: PetscFunctionReturn(PETSC_SUCCESS);
2170: }
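/* A minimal usage sketch added for illustration (not part of the original source) of how the
   device ILU(0) path above is typically reached from the command line; these are standard PETSc
   options, and natural (identity) ordering is the default preferred ordering set for ILU below:

     -mat_type aijkokkos -pc_type ilu -pc_factor_levels 0 -pc_factor_mat_solver_type kokkos

   Nonzero fill levels or a non-identity ordering fall back to PETSc's host ILU as coded above.
*/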
2172: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2173: {
2174: PetscFunctionBegin;
2175: PetscCall(MatSeqAIJKokkosSyncHost(A));
2176: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2178: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2179: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2180: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
2181: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2182: const MatScalar *Ba = b->a;
2183: PetscInt m = B->rmap->n;
2185:     if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2186: // Allocate memory and copy the structure
2187: factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2188: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2189: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2190: factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
2191: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2192: factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2194:       // Build jU_h from Bj by prepending the diagonal entry to each row (Bj stores only the entries right of the diagonal)
2195: PetscInt *Uj = factors->jU_h.data();
2196: for (PetscInt i = 0; i < m; i++) {
2197: PetscInt ulen = Bi[i + 1] - Bi[i];
2198: Uj[Bi[i]] = i; // diagonal entry
2199: PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2200: }
2202: // Copy iU, jU to device
2203: PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2204: PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2206: // Copy row/col permutation to device
2207: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
2208: PetscBool row_identity;
2209: PetscCall(ISIdentity(rowperm, &row_identity));
2210: if (!row_identity) {
2211: const PetscInt *ip;
2213: PetscCall(ISGetIndices(rowperm, &ip));
2214: PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2215: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2216: PetscCall(ISRestoreIndices(rowperm, &ip));
2217: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2218: }
2220: // Create sptrsv handles for U and U^T
2221: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2222: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2223: #else
2224: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2225: #endif
2226: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2227: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2228: }
2229:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2230: B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
2231: B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2233:     // Copy the values
2234: PetscScalar *Ua = factors->aU_h.data();
2235: PetscScalar *D = factors->D_h.data();
2236: for (PetscInt i = 0; i < m; i++) {
2237:       D[i] = Ba[Bdiag[i]]; // Ba[Bdiag[i]] actually stores the inverse of the diagonal
2238: Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2239: for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2240: }
2241: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2242: PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2244: factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2245: factors->transpose_updated = PETSC_FALSE;
2246: }
2248: B->ops->matsolve = NULL;
2249: B->ops->matsolvetranspose = NULL;
2250: PetscFunctionReturn(PETSC_SUCCESS);
2251: }
2253: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2254: {
2255: PetscFunctionBegin;
2256: if (info->solveonhost) {
2257: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2258: PetscCall(MatSetType(B, MATSEQSBAIJ));
2259: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2260: }
2262: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2264: if (!info->solveonhost) {
2265: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2266: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2267: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2268: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2269: }
2270: PetscFunctionReturn(PETSC_SUCCESS);
2271: }
2273: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2274: {
2275: PetscFunctionBegin;
2276: if (info->solveonhost) {
2277: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2278: PetscCall(MatSetType(B, MATSEQSBAIJ));
2279: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2280: }
2282: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2284: if (!info->solveonhost) {
2285: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2286: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2287: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2288: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2289: }
2290: PetscFunctionReturn(PETSC_SUCCESS);
2291: }
2293: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2294: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2295: {
2296: PetscFunctionBegin;
2297: *type = MATSOLVERKOKKOS;
2298: PetscFunctionReturn(PETSC_SUCCESS);
2299: }
2301: /*MC
2302:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2303:   of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS` on a single GPU.
2305: Level: beginner
2307: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2308: M*/
2309: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2310: {
2311: PetscInt n = A->rmap->n;
2312: MPI_Comm comm;
2314: PetscFunctionBegin;
2315: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2316: PetscCall(MatCreate(comm, B));
2317: PetscCall(MatSetSizes(*B, n, n, n, n));
2318: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2319: (*B)->factortype = ftype;
2320: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2321: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2322: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2324: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2325: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2326: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2327: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2328: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2329: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2330: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2331: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2332: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2333: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2334: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2335: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2337: // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2338: (*B)->canuseordering = PETSC_TRUE;
2339: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2340: PetscFunctionReturn(PETSC_SUCCESS);
2341: }
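/* A minimal usage sketch added for illustration (not part of the original source; assumes a KSP
   ksp already created, e.g., by KSPCreate(); error handling elided). It selects this solver
   programmatically through the standard PC interface:

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCLU));                           // or PCILU, PCCHOLESKY, PCICC
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS)); // use the factorizations above
*/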
2343: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2344: {
2345: PetscFunctionBegin;
2346: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2347: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2348: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2350: PetscFunctionReturn(PETSC_SUCCESS);
2351: }
2353: /* Utility to print out a KokkosCsrMatrix for debugging */
2354: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2355: {
2356: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2357: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2358: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2359: const PetscInt *i = iv.data();
2360: const PetscInt *j = jv.data();
2361: const PetscScalar *a = av.data();
2362: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2364: PetscFunctionBegin;
2365: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2366: for (PetscInt k = 0; k < m; k++) {
2367: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2368: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2369: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2370: }
2371: PetscFunctionReturn(PETSC_SUCCESS);
2372: }
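/* Sample output added for illustration (not part of the original source), for a 2 x 2 matrix with
   entries (0,0)=1, (0,1)=2, (1,1)=3, following the print format above:

     2 x 2 SeqAIJKokkos, with 3 nonzeros
     0: 0(1.0), 1(2.0),
     1: 1(3.0),
*/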