Actual source code: aijkok.kokkos.cxx

#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscmat_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/kokkosimpl.hpp>
#include <petscsys.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>

// To suppress compiler warnings:
// /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
// warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
// cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
// int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
#define DISABLE_CUSPARSE_DEPRECATED
#include <KokkosSparse_spmv.hpp>

#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
  #include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
  #include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
using KokkosSparse::spiluk_symbolic;
using KokkosSparse::spiluk_numeric;
using KokkosSparse::sptrsv_symbolic;
using KokkosSparse::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#else
using KokkosSparse::Experimental::spiluk_symbolic;
using KokkosSparse::Experimental::spiluk_numeric;
using KokkosSparse::Experimental::sptrsv_symbolic;
using KokkosSparse::Experimental::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

/* MatAssemblyEnd_SeqAIJKokkos() is called when the nonzeros of the matrix have been finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
 */
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
    if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing the much-needed a, i, j on host when deleting aijkok */
      PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
      PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
      PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
      PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
      aijseq->free_a  = PETSC_TRUE;
      aijseq->free_ij = PETSC_TRUE;
      /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
         we have the CsrMatrix on device first and then copy it to host, followed by
         MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
         One could improve it by not using a NULL garray.
      */
    }
    delete aijkok;
    aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sync CSR data to device if not yet */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
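
/* A minimal usage sketch (illustrative only, not part of this file) of the sync/modify
   protocol implemented above; the matrix A and the doubling kernel are hypothetical:

     PetscCall(MatSeqAIJKokkosSyncDevice(A));   // make sure device has the latest values
     auto akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
     auto Aa   = akok->a_dual.view_device();
     Kokkos::parallel_for(                      // update values on device
       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Aa.extent(0)),
       KOKKOS_LAMBDA(const PetscInt i) { Aa(i) *= 2.0; });
     PetscCall(MatSeqAIJKokkosModifyDevice(A)); // mark the host copy as stale
*/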

static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto              exec   = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect factors to be needed on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
    Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
    reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
    must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* E.g., happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}
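
/* Sketch of how a caller might consume the device CSR arrays returned above (illustrative;
   A is assumed to be an assembled MATSEQAIJKOKKOS matrix):

     const PetscInt *i, *j;
     PetscScalar    *a;
     PetscMemType    mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A, &i, &j, &a, &mtype));
     // mtype is PETSC_MEMTYPE_KOKKOS: i, j, a point to device memory and may only be
     // dereferenced inside device kernels, not on the host.
*/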

static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
{
  PetscFunctionBegin;
  *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

  Input Parameter:
.  A       - the MATSEQAIJKOKKOS matrix

  Output Parameters:
+  perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
-  T_d    - the transpose on device, whose value array is allocated but not initialized
*/
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j];                       // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}
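
/* Worked example of the counting/prefix-sum passes above (illustrative): for the 2x3 matrix
   with nonzeros at (0,0), (0,2), (1,1), i.e., Ai = {0,2,3} and Aj = {0,2,1}, the shifted
   count loop yields Ti = {0,1,1,1}, the prefix sum turns it into the transpose row map
   Ti = {0,1,2,3}, and the fill pass records Tj = {0,1,0} and perm = {0,2,1}, so that
   Ta(i) = Aa(perm(i)) places each value of A in its transposed position.
*/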

// Generate the transpose on device and cache it internally
// Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose whose values are up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {            // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT                = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian transpose on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian transpose whose values are up to date
    if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatH had been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {            // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH                = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows would be more accurate here, but it assumes there are no zero rows. So we are a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
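
/* The three products above differ only in which cached CSR is handed to KokkosSparse::spmv()
   and in its mode string: "N" multiplies with the matrix as stored, "T" with its transpose,
   and "C" with its conjugate transpose. A hedged sketch of the underlying call, with
   alpha, beta, x, y hypothetical:

     // y = alpha * op(A) * x + beta * y, op selected by the mode string
     KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "T", alpha, csrmat, x, beta, y);
*/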

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    PetscCall(MatSetType(*newmat, mtype));
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
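
/* Typical conversion calls as seen from user code (illustrative; A is an assembled
   MATSEQAIJ matrix):

     Mat Akok;
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INITIAL_MATRIX, &Akok)); // new matrix
     PetscCall(MatConvert(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));    // convert A itself
*/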

/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
 */
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host, as A's latest values might be on device. We don't want to sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should already be zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
#endif
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
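
/* Sketch of selecting this type from the options database (illustrative; n is hypothetical):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, n, n, n, n));
     PetscCall(MatSetFromOptions(A)); // picks up -mat_type aijkokkos from the command line

   and run the program with: ./app -mat_type aijkokkos
*/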

/* Merge A, B into a matrix C. A is put before B. C's size is A->rmap->n by (A->cmap->n + B->cmap->n) */
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ         *a, *b;
  Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt            m, n, nnz, aN;

  PetscFunctionBegin;
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a    = static_cast<Mat_SeqAIJ *>(A->data);
  b    = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa   = akok->a_dual.view_device();
  ai   = akok->i_dual.view_device();
  ba   = bkok->a_dual.view_device();
  bi   = bkok->i_dual.view_device();
  m    = A->rmap->n; /* M, N and nnz of C */
  n    = A->cmap->n + B->cmap->n;
  nnz  = a->nz + b->nz;
  aN   = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj      = akok->j_dual.view_device();
    bj      = bkok->j_dual.view_device();
    auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
    auto ci = MatRowMapKokkosView("i", ai.extent(0));
    auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i       = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
                                                   ci(i) = coffset;
                                                   if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca   = ckok->a_dual.view_device();
    ci   = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i    = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
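
/* Small worked example of the merge above (illustrative): if A is 2x2 and B is 2x1,

     A = [a0 a1]    B = [b0]    then C is 2x3:    C = [a0 a1 b0]
         [ 0 a2]        [b1]                          [ 0 a2 b1]

   row i of C is row i of A followed by row i of B, with B's column indices shifted by aN = 2.
*/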

static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
{
  PetscFunctionBegin;
  delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  bool                         transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  Mat_SeqAIJ                  *c;
  MatProductData_SeqAIJKokkos *pdata;
  KokkosCsrMatrix              csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do the numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  MatProductType               ptype;
  Mat                          A, B;
  bool                         transA, transB;
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  MatProductData_SeqAIJKokkos *pdata;
  MPI_Comm                     comm;
  KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
    So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
    calling new Mat_SeqAIJKokkos().
    TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query whether KK outputs a sorted matrix. If not, we need to sort it ourselves */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}
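
/* The symbolic/numeric split above follows the Kokkos-Kernels spgemm handle workflow. A
   hedged standalone sketch (handle template parameters elided; A, B, C hypothetical
   KokkosCsrMatrix objects):

     KokkosKernels::Experimental::KokkosKernelsHandle<...> kh; // device-appropriate types
     kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT);
     KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C); // builds C's row map
     KokkosSparse::spgemm_numeric(kh, A, false, B, false, C);  // fills C's entries
     kh.destroy_spgemm_handle();
*/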

/* handles sparse matrix-matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt                   n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n);
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
{
  Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
  ConstPetscScalarKokkosView lv, rv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  const auto &Aa     = aijkok->a_dual.view_device();
  const auto &Ai     = aijkok->i_dual.view_device();
  const auto &Aj     = aijkok->j_dual.view_device();
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetKokkosView(ll, &lv));
    PetscCallCXX(Kokkos::parallel_for( // for each row
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i   = t.league_rank(); // row i
        PetscInt len = Ai(i + 1) - Ai(i);
        // scale entries on the row
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
      }));
    PetscCall(VecRestoreKokkosView(ll, &lv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetKokkosView(rr, &rv));
    PetscCallCXX(Kokkos::parallel_for( // for each nonzero
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
    PetscCall(VecRestoreKokkosView(rr, &rv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
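
/* Entrywise, the two kernels above implement A = diag(ll) * A * diag(rr): the k-th stored
   value becomes Aa(k) = lv(row(k)) * Aa(k) * rv(Aj(k)), where row(k) is the row owning
   nonzero k. The left scaling is done per row by a team, since the row index is only
   available via the row map; the right scaling is done per nonzero by a flat range
   policy, since the column index Aj(k) is available directly.
*/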

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos     *aijkok;
  PetscInt              n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
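
/* Usage sketch for the accessors above (illustrative): a read-modify-write of the values
   on device, with A a hypothetical assembled MATSEQAIJKOKKOS matrix:

     MatScalarKokkosView av;
     PetscCall(MatSeqAIJGetKokkosView(A, &av));     // syncs device, returns the device view
     Kokkos::parallel_for(
       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, av.extent(0)),
       KOKKOS_LAMBDA(const PetscInt i) { av(i) += 1.0; });
     PetscCall(MatSeqAIJRestoreKokkosView(A, &av)); // marks device values as modified
*/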

1245: PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1246: {
1247:   Mat_SeqAIJKokkos *akok;

1249:   PetscFunctionBegin;
1250:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1251:   PetscCall(MatCreate(comm, A));
1252:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1253:   PetscFunctionReturn(PETSC_SUCCESS);
1254: }
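/* A hedged sketch, assuming the caller has already produced CSR data on device: build a 2x2
   identity matrix directly from Kokkos views (view labels are illustrative).

     Kokkos::View<PetscInt *>    i_d("i", 3), j_d("j", 2);
     Kokkos::View<PetscScalar *> a_d("a", 2);
     // fill i_d = {0,1,2}, j_d = {0,1}, a_d = {1,1}, e.g. with Kokkos::deep_copy from host mirrors
     Mat A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, 2, 2, i_d, j_d, a_d, &A));
*/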

1256: /* Computes Y += alpha*X */
1257: static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1258: {
1259:   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1260:   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1261:   ConstMatScalarKokkosView Xa;
1262:   MatScalarKokkosView      Ya;
1263:   auto                     exec = PetscGetKokkosExecutionSpace();

1265:   PetscFunctionBegin;
1266:   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1267:   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1268:   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1269:   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1270:   PetscCall(PetscLogGpuTimeBegin());

1272:   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1273:     PetscBool e;
1274:     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1275:     if (e) {
1276:       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1277:       if (e) pattern = SAME_NONZERO_PATTERN;
1278:     }
1279:   }

1281:   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knows the sparsity pattern of C, one can skip
1282:     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz() and directly call cusparseScsrgeam2().
1283:     If X's pattern is a subset of Y's (SUBSET_NONZERO_PATTERN), we could take advantage of this cusparse feature. However,
1284:     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
1285:   */
1286:   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1287:   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1288:   Xa   = xkok->a_dual.view_device();
1289:   Ya   = ykok->a_dual.view_device();

1291:   if (pattern == SAME_NONZERO_PATTERN) {
1292:     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1293:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1294:   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1295:     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1296:     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

1298:     Kokkos::parallel_for(
1299:       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1300:         PetscInt i = t.league_rank(); // row i
1301:         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1302:           // Only one thread works in a team
1303:           PetscInt p, q = Yi(i);
1304:           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1305:             while (q < Yi(i + 1) && Xj(p) != Yj(q)) q++;   // find the matching nonzero on row i of Y; check the bound first to avoid reading past the row
1306:             if (q < Yi(i + 1) && Xj(p) == Yj(q)) {         // Found it
1307:               Ya(q) += alpha * Xa(p);
1308:               q++;
1309:             } else {
1310:             // If not found, the input is wrong (X is not a nonzero-pattern subset of Y).
1311:             // Poison the first entry of row i (if the row is not empty) with a NaN so the wrong result is noticeable.
1312: #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1313:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1314: #else
1315:               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1316: #endif
1317:             }
1318:           }
1319:         });
1320:       });
1321:     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1322:   } else { // different nonzero patterns
1323:     Mat             Z;
1324:     KokkosCsrMatrix zcsr;
1325:     KernelHandle    kh;
1326:     kh.create_spadd_handle(true); // X, Y are sorted
1327:     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1328:     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1329:     zkok = new Mat_SeqAIJKokkos(zcsr);
1330:     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1331:     PetscCall(MatHeaderReplace(Y, &Z));
1332:     kh.destroy_spadd_handle();
1333:   }
1334:   PetscCall(PetscLogGpuTimeEnd());
1335:   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1336:   PetscFunctionReturn(PETSC_SUCCESS);
1337: }
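/* A hedged usage sketch: Y += 2*X entirely on device. Passing SAME_NONZERO_PATTERN selects the
   cheap KokkosBlas::axpy path above; DIFFERENT_NONZERO_PATTERN falls back to spadd and replaces
   Y's storage via MatHeaderReplace().

     PetscCall(MatAXPY(Y, 2.0, X, SAME_NONZERO_PATTERN));
*/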

1339: struct MatCOOStruct_SeqAIJKokkos {
1340:   PetscCount           n;
1341:   PetscCount           Atot;
1342:   PetscInt             nz;
1343:   PetscCountKokkosView jmap;
1344:   PetscCountKokkosView perm;

1346:   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1347:   {
1348:     nz   = coo_h->nz;
1349:     n    = coo_h->n;
1350:     Atot = coo_h->Atot;
1351:     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1352:     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1353:   }
1354: };

1356: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1357: {
1358:   PetscFunctionBegin;
1359:   PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1360:   PetscFunctionReturn(PETSC_SUCCESS);
1361: }

1363: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1364: {
1365:   Mat_SeqAIJKokkos          *akok;
1366:   Mat_SeqAIJ                *aseq;
1367:   PetscContainer             container_h;
1368:   MatCOOStruct_SeqAIJ       *coo_h;
1369:   MatCOOStruct_SeqAIJKokkos *coo_d;

1371:   PetscFunctionBegin;
1372:   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1373:   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1374:   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1375:   delete akok;
1376:   mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1377:   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

1379:   // Copy the COO struct to device
1380:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1381:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1382:   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

1384:   // Put the COO struct in a container and then attach that to the matrix
1385:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1386:   PetscFunctionReturn(PETSC_SUCCESS);
1387: }

1389: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1390: {
1391:   MatScalarKokkosView        Aa;
1392:   ConstMatScalarKokkosView   kv;
1393:   PetscMemType               memtype;
1394:   PetscContainer             container;
1395:   MatCOOStruct_SeqAIJKokkos *coo;

1397:   PetscFunctionBegin;
1398:   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1399:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

1401:   const auto &n    = coo->n;
1402:   const auto &Annz = coo->nz;
1403:   const auto &jmap = coo->jmap;
1404:   const auto &perm = coo->perm;

1406:   PetscCall(PetscGetMemType(v, &memtype));
1407:   if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, we may need to copy it to device (a no-op when host and device share memory) */
1408:     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1409:   } else {
1410:     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1411:   }

1413:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1414:   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

1416:   PetscCall(PetscLogGpuTimeBegin());
1417:   Kokkos::parallel_for(
1418:     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1419:       PetscScalar sum = 0.0;
1420:       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1421:       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1422:     });
1423:   PetscCall(PetscLogGpuTimeEnd());

1425:   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1426:   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1427:   PetscFunctionReturn(PETSC_SUCCESS);
1428: }
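/* A hedged sketch of the COO assembly path, with illustrative indices. Repeated (i,j) pairs are
   summed; that is exactly what the jmap/perm arrays above implement on device.

     PetscInt    coo_i[] = {0, 0, 1};
     PetscInt    coo_j[] = {0, 1, 1};
     PetscScalar v[]     = {1.0, 2.0, 3.0}; // may be a host or device pointer, see PetscGetMemType() above
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // dispatches to MatSetValuesCOO_SeqAIJKokkos
*/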

1430: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1431: {
1432:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

1434:   PetscFunctionBegin;
1435:   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1436:   A->boundtocpu  = PETSC_FALSE;

1438:   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1439:   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1440:   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1441:   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1442:   A->ops->scale                     = MatScale_SeqAIJKokkos;
1443:   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1444:   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1445:   A->ops->mult                      = MatMult_SeqAIJKokkos;
1446:   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1447:   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1448:   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1449:   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1450:   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1451:   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1452:   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1453:   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1454:   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1455:   A->ops->shift                     = MatShift_SeqAIJKokkos;
1456:   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1457:   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1458:   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1459:   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1460:   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1461:   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1462:   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1463:   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1464:   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1465:   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

1467:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1468:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1469: #if defined(PETSC_HAVE_HYPRE)
1470:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1471: #endif
1472:   PetscFunctionReturn(PETSC_SUCCESS);
1473: }

1475: /*
1476:    Extract the (prescribed) diagonal blocks of the matrix and then invert them

1478:   Input Parameters:
1479: +  A       - the MATSEQAIJKOKKOS matrix
1480: .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1481: .  bs2     - squared block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1482: .  blkMap  - maps row ids to block ids, i.e., row i belongs to block blkMap(i)
1483: -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine

1485:   Output Parameter:
1486: .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1487: */
1488: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1489: {
1490:   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1491:   PetscInt          nblocks = bs.extent(0) - 1;

1493:   PetscFunctionBegin;
1494:   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

1496:   // Pull out the diagonal blocks of the matrix and then invert the blocks
1497:   auto Aa    = akok->a_dual.view_device();
1498:   auto Ai    = akok->i_dual.view_device();
1499:   auto Aj    = akok->j_dual.view_device();
1500:   auto Adiag = akok->diag_dual.view_device();
1501:   // TODO: how to tune the team size?
1502: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1503:   auto ts = Kokkos::AUTO();
1504: #else
1505:   auto ts = 16; // improved performance by ~30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1506: #endif
1507:   PetscCallCXX(Kokkos::parallel_for(
1508:     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1509:       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1510:       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1511:       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1512:       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1513:       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);

1515:       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1516:         PetscInt i = rstart + r;                                                            // i-th row in A

1518:         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1519:           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row

1521:           for (PetscInt c = 0; c < m; c++) {                   // walk m steps to see what column indices we will meet
1522:             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1523:               B(r, c) = 0.0;
1524:             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1525:               B(r, c) = Aa(first + c);
1526:             } else { // this entry does not show up in the CSR
1527:               B(r, c) = 0.0;
1528:             }
1529:           }
1530:         } else { // rare case that the diagonal does not exist
1531:           const PetscInt begin = Ai(i);
1532:           const PetscInt end   = Ai(i + 1);
1533:           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1534:           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1535:             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1536:             else if (Aj(j) >= rstart + m) break;
1537:           }
1538:         }
1539:       });

1541:       // LU-decompose B (w/o pivoting) and then invert B
1542:       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1543:       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1544:     }));
1545:   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1546:   PetscFunctionReturn(PETSC_SUCCESS);
1547: }
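/* A hedged example of the block encoding above, for two diagonal blocks of sizes 2 and 3 in a
   5x5 matrix (values illustrative):
     bs     = {0, 2, 5}       // block i spans rows bs(i) .. bs(i+1)-1
     bs2    = {0, 4, 13}      // running sum of squared block sizes; block i starts at offset bs2(i)
     blkMap = {0, 0, 1, 1, 1} // row id -> block id
   diagVal and work must each hold bs2(nblocks) = 13 scalars. */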

1549: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1550: {
1551:   Mat_SeqAIJ *aseq;
1552:   PetscInt    i, m, n;
1553:   auto        exec = PetscGetKokkosExecutionSpace();

1555:   PetscFunctionBegin;
1556:   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");

1558:   m = akok->nrows();
1559:   n = akok->ncols();
1560:   PetscCall(MatSetSizes(A, m, n, m, n));
1561:   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));

1563:   /* Set up data structures of A as a MATSEQAIJ */
1564:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1565:   aseq = (Mat_SeqAIJ *)A->data;

1567:   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1568:   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));

1570:   aseq->i       = akok->i_host_data();
1571:   aseq->j       = akok->j_host_data();
1572:   aseq->a       = akok->a_host_data();
1573:   aseq->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1574:   aseq->free_a  = PETSC_FALSE;
1575:   aseq->free_ij = PETSC_FALSE;
1576:   aseq->nz      = akok->nnz();
1577:   aseq->maxnz   = aseq->nz;

1579:   PetscCall(PetscMalloc1(m, &aseq->imax));
1580:   PetscCall(PetscMalloc1(m, &aseq->ilen));
1581:   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];

1583:   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1584:   akok->nonzerostate = A->nonzerostate;
1585:   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1586:   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1587:   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1588:   PetscFunctionReturn(PETSC_SUCCESS);
1589: }

1591: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1592: {
1593:   PetscFunctionBegin;
1594:   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1595:   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1596:   PetscFunctionReturn(PETSC_SUCCESS);
1597: }
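/* A hedged sketch: the extracted KokkosCsrMatrix can be passed straight to Kokkos Kernels,
   e.g. an SpMV y = A x with illustrative device views x and y.

     KokkosCsrMatrix csr;
     PetscCall(MatSeqAIJKokkosGetKokkosCsrMatrix(A, &csr));
     KokkosSparse::spmv("N", 1.0, csr, x, 0.0, y);
*/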

1599: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1600: {
1601:   Mat_SeqAIJKokkos *akok;

1603:   PetscFunctionBegin;
1604:   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1605:   PetscCall(MatCreate(comm, A));
1606:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1607:   PetscFunctionReturn(PETSC_SUCCESS);
1608: }

1610: /* Create a MATSEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure

1612:    Note we have names like MatSeqAIJSetPreallocationCSR, so we use capitalized CSR
1613:  */
1614: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1615: {
1616:   PetscFunctionBegin;
1617:   PetscCall(MatCreate(comm, A));
1618:   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1619:   PetscFunctionReturn(PETSC_SUCCESS);
1620: }

1622: /*@C
1623:   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1624:   (the default sequential PETSc format). This matrix will ultimately be handled by
1625:   Kokkos for calculations.

1627:   Collective

1629:   Input Parameters:
1630: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1631: . m    - number of rows
1632: . n    - number of columns
1633: . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1634: - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

1636:   Output Parameter:
1637: . A - the matrix

1639:   Level: intermediate

1641:   Notes:
1642:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1643:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1644:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

1646:   The AIJ format, also called
1647:   compressed row storage, is fully compatible with standard Fortran
1648:   storage.  That is, the stored row and column indices can begin at
1649:   either one (as in Fortran) or zero.

1651:   Specify the preallocated storage with either `nz` or `nnz` (not both).
1652:   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1653:   allocation.

1655: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1656: @*/
1657: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1658: {
1659:   PetscFunctionBegin;
1660:   PetscCall(PetscKokkosInitializeCheck());
1661:   PetscCall(MatCreate(comm, A));
1662:   PetscCall(MatSetSizes(*A, m, n, m, n));
1663:   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1664:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1665:   PetscFunctionReturn(PETSC_SUCCESS);
1666: }
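/* A hedged usage sketch following the man page above: preallocate a tridiagonal matrix and
   assemble it on host; the data migrates to device lazily on first device use.

     Mat A;
     PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, n, n, 3, NULL, &A));
     // ... MatSetValues() ...
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/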

1668: // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1669: // For example, for transpose solves we may need to compute the transpose matrices explicitly if the solver does not support transpose solve natively (KK does not, while cusparse does).
1670: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1671: // Similarly, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1672: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1673: {
1674:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1675:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1676:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky

1678:   PetscFunctionBegin;
1679:   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1680:     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1681:     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1682:     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1683:   }
1684:   PetscFunctionReturn(PETSC_SUCCESS);
1685: }

1687: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1688: {
1689:   const PetscInt              n         = A->rmap->n;
1690:   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1691:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1692:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

1694:   PetscFunctionBegin;
1695:   if (!factors->transpose_updated) {
1696:     if (has_upper) {
1697:       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not done yet
1698:         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1699:         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1700:         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1701:       }

1703:       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1704:         if (!factors->U) {
1705:           Mat_SeqAIJ *seq;

1707:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1708:           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

1710:           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1711:           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1712:           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1713:           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1714:         } else {
1715:           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1716:         }
1717:         // Copy Ut from host to device
1718:         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1719:         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1720:         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1721:       } else { // If U was computed on device, we also compute the transpose there
1722:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1723:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1724:                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1725:                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1726:                                                                                                                                                                                                                                factors->aUt_d));
1727:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1728:       }
1729:       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730:     }

1732:     // do the same for L with LU
1733:     if (has_lower) {
1734:       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not done yet
1735:         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1736:         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1737:         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1738:       }

1740:       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1741:         if (!factors->L) {
1742:           Mat_SeqAIJ *seq;

1744:           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1745:           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

1747:           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1748:           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1749:           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1750:           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1751:         } else {
1752:           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1753:         }
1754:         // Copy Lt from host to device
1755:         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1756:         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1757:         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1758:       } else { // If L was computed on device, we also compute the transpose there
1759:         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1760:         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1761:                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1762:                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1763:                                                                                                                                                                                                                                factors->aLt_d));
1764:         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1765:       }
1766:       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767:     }

1769:     factors->transpose_updated = PETSC_TRUE;
1770:   }
1771:   PetscFunctionReturn(PETSC_SUCCESS);
1772: }

1774: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and column) permutation matrix on A.
1775: // R is represented by rowperm in factors. If R is the identity (i.e., no reordering), then rowperm is empty.
1776: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1777: {
1778:   auto                        exec    = PetscGetKokkosExecutionSpace();
1779:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1780:   PetscInt                    m       = A->rmap->n;
1781:   PetscScalarKokkosView       D       = factors->D_d;
1782:   PetscScalarKokkosView       X, Y, B; // alias
1783:   ConstPetscScalarKokkosView  b;
1784:   PetscScalarKokkosView       x;
1785:   PetscIntKokkosView         &rowperm  = factors->rowperm;
1786:   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1788:   PetscFunctionBegin;
1789:   PetscCall(PetscLogGpuTimeBegin());
1790:   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1791:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1792:   PetscCall(VecGetKokkosView(bb, &b));
1793:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1795:   // Solve U^T Y = B
1796:   if (identity) { // No row reordering; use b directly
1797:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1798:     Y = factors->workVector;
1799:   } else { // Reorder b with the row permutation
1800:     B = factors->workVector;
1801:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1802:     Y = x;
1803:   }
1804:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1806:   // Solve diag(D) Y' = Y.
1807:   // Actually just compute Y' = Y*D, since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(); it is simply an element-wise vector multiplication.
1808:   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

1810:   // Solve UX = Y
1811:   if (identity) {
1812:     X = x;
1813:   } else {
1814:     X = factors->workVector; // B is not needed anymore
1815:   }
1816:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1818:   // Reorder X with the inverse column (row) permutation
1819:   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

1821:   PetscCall(VecRestoreKokkosView(bb, &b));
1822:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1823:   PetscCall(PetscLogGpuTimeEnd());
1824:   PetscFunctionReturn(PETSC_SUCCESS);
1825: }
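/* In equation form (a hedged restatement of the code above): with R A R^T = U^T D U, where D is
   diagonal and stored already inverted, the solve is
     U^T Y = R b;   Y' = D^{-1} Y (computed as Y times the stored D);   U X = Y';   x = R^T X.
   When R is the identity, the gather/scatter steps are skipped as coded above. */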

1827: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1828: // R and C are represented by rowperm and colperm in factors.
1829: // If R or C is the identity (i.e., no reordering), then rowperm or colperm is empty.
1830: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1831: {
1832:   auto                        exec    = PetscGetKokkosExecutionSpace();
1833:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1834:   PetscInt                    m       = A->rmap->n;
1835:   PetscScalarKokkosView       X, Y, B; // alias
1836:   ConstPetscScalarKokkosView  b;
1837:   PetscScalarKokkosView       x;
1838:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1839:   PetscIntKokkosView         &colperm      = factors->colperm;
1840:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1841:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1843:   PetscFunctionBegin;
1844:   PetscCall(PetscLogGpuTimeBegin());
1845:   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1846:   PetscCall(VecGetKokkosView(bb, &b));
1847:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1849:   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1850:   if (row_identity) {
1851:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1852:     Y = factors->workVector;
1853:   } else {
1854:     B = factors->workVector;
1855:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1856:     Y = x;
1857:   }
1858:   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

1860:   // Solve U C^- x = Y
1861:   if (col_identity) {
1862:     X = x;
1863:   } else {
1864:     X = factors->workVector;
1865:   }
1866:   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

1868:   // x = C X; Reorder X with the inverse col permutation
1869:   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));

1871:   PetscCall(VecRestoreKokkosView(bb, &b));
1872:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1873:   PetscCall(PetscLogGpuTimeEnd());
1874:   PetscFunctionReturn(PETSC_SUCCESS);
1875: }
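/* In equation form (a hedged restatement of the code above): with R A C = L U and permutation
   matrices satisfying R^{-1} = R^T, C^{-1} = C^T, A x = b becomes L U (C^T x) = R b, solved as
     L Y = R b;   U X = Y;   x = C X.
   Identity permutations skip the gather/scatter steps. */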

1877: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1878: // R and C are represented by rowperm and colperm in factors.
1879: // If R or C is the identity (i.e., no reordering), then rowperm or colperm is empty.
1880: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1881: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1882: {
1883:   auto                        exec    = PetscGetKokkosExecutionSpace();
1884:   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1885:   PetscInt                    m       = A->rmap->n;
1886:   PetscScalarKokkosView       X, Y, B; // alias
1887:   ConstPetscScalarKokkosView  b;
1888:   PetscScalarKokkosView       x;
1889:   PetscIntKokkosView         &rowperm      = factors->rowperm;
1890:   PetscIntKokkosView         &colperm      = factors->colperm;
1891:   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1892:   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

1894:   PetscFunctionBegin;
1895:   PetscCall(PetscLogGpuTimeBegin());
1896:   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1897:   PetscCall(VecGetKokkosView(bb, &b));
1898:   PetscCall(VecGetKokkosViewWrite(xx, &x));

1900:   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1901:   if (col_identity) { // No column reordering; use b directly
1902:     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1903:     Y = factors->workVector;
1904:   } else { // Reorder b with the col permutation
1905:     B = factors->workVector;
1906:     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1907:     Y = x;
1908:   }
1909:   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

1911:   // Solve L^T X = Y
1912:   if (row_identity) {
1913:     X = x;
1914:   } else {
1915:     X = factors->workVector;
1916:   }
1917:   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

1919:   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1920:   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

1922:   PetscCall(VecRestoreKokkosView(bb, &b));
1923:   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1924:   PetscCall(PetscLogGpuTimeEnd());
1925:   PetscFunctionReturn(PETSC_SUCCESS);
1926: }

1928: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1929: {
1930:   PetscFunctionBegin;
1931:   PetscCall(MatSeqAIJKokkosSyncHost(A));
1932:   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));

1934:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1935:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1936:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1937:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1938:     const MatScalar            *Ba = b->a;
1939:     PetscInt                    m = B->rmap->n, n = B->cmap->n;

1941:     if (factors->iL_h.extent(0) == 0) { // First numeric factorization
1942:       // Allocate memory and copy the L, U structure
1943:       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1944:       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1945:       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1946:       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1947:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1948:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));

1950:       PetscInt *Li = factors->iL_h.data();
1951:       PetscInt *Lj = factors->jL_h.data();
1952:       PetscInt *Ui = factors->iU_h.data();
1953:       PetscInt *Uj = factors->jU_h.data();

1955:       Li[0] = Ui[0] = 0;
1956:       for (PetscInt i = 0; i < m; i++) {
1957:         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1958:         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry

1960:         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1961:         Lj[Li[i] + llen] = i;                                   // diagonal entry of L

1963:         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1964:         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal

1966:         Li[i + 1] = Li[i] + llen + 1;
1967:         Ui[i + 1] = Ui[i] + ulen;
1968:       }

1970:       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1971:       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1972:       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1973:       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1974:       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1975:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);

1977:       // Copy row/col permutation to device
1978:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1979:       PetscBool row_identity;
1980:       PetscCall(ISIdentity(rowperm, &row_identity));
1981:       if (!row_identity) {
1982:         const PetscInt *ip;

1984:         PetscCall(ISGetIndices(rowperm, &ip));
1985:         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1986:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1987:         PetscCall(ISRestoreIndices(rowperm, &ip));
1988:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1989:       }

1991:       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1992:       PetscBool col_identity;
1993:       PetscCall(ISIdentity(colperm, &col_identity));
1994:       if (!col_identity) {
1995:         const PetscInt *ip;

1997:         PetscCall(ISGetIndices(colperm, &ip));
1998:         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1999:         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2000:         PetscCall(ISRestoreIndices(colperm, &ip));
2001:         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2002:       }

2004:       /* Create sptrsv handles for L, U and their transpose */
2005: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2006:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2007: #else
2008:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2009: #endif
2010:       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2011:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2012:       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2013:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2014:     }

2016:     // Copy the value
2017:     for (PetscInt i = 0; i < m; i++) {
2018:       PetscInt        llen = Bi[i + 1] - Bi[i];
2019:       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2020:       const PetscInt *Li   = factors->iL_h.data();
2021:       const PetscInt *Ui   = factors->iU_h.data();

2023:       PetscScalar *La = factors->aL_h.data();
2024:       PetscScalar *Ua = factors->aU_h.data();

2026:       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2027:       La[Li[i] + llen] = 1.0;                                 // diagonal entry

2029:       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
2030:       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2031:     }

2033:     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2034:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2035:     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2036:     factors->transpose_updated         = PETSC_FALSE;
2037:     factors->sptrsv_symbolic_completed = PETSC_FALSE;

2039:     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2040:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2041:   }

2043:   B->ops->matsolve          = NULL;
2044:   B->ops->matsolvetranspose = NULL;
2045:   PetscFunctionReturn(PETSC_SUCCESS);
2046: }

2048: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2049: {
2050:   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2051:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2052:   PetscInt                    fill_lev = info->levels;

2054:   PetscFunctionBegin;
2055:   PetscCall(PetscLogGpuTimeBegin());
2056:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2057:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2059:   auto a_d = aijkok->a_dual.view_device();
2060:   auto i_d = aijkok->i_dual.view_device();
2061:   auto j_d = aijkok->j_dual.view_device();

2063:   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));

2065:   B->assembled              = PETSC_TRUE;
2066:   B->preallocated           = PETSC_TRUE;
2067:   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2068:   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2069:   B->ops->matsolve          = NULL;
2070:   B->ops->matsolvetranspose = NULL;

2072:   /* Once the factors' values have changed, we need to update their transposes and redo the sptrsv symbolic phase */
2073:   factors->transpose_updated         = PETSC_FALSE;
2074:   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2075:   /* TODO: log flops, but how do we know the flop count? */
2076:   PetscCall(PetscLogGpuTimeEnd());
2077:   PetscFunctionReturn(PETSC_SUCCESS);
2078: }

2080: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2081: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2082: {
2083:   Mat_SeqAIJKokkos           *aijkok;
2084:   Mat_SeqAIJ                 *b;
2085:   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2086:   PetscInt                    fill_lev = info->levels;
2087:   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2088:   PetscInt                    n        = A->rmap->n;

2090:   PetscFunctionBegin;
2091:   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2092:   PetscCall(MatSeqAIJKokkosSyncDevice(A));

2094:   /* Create a spiluk handle and then do symbolic factorization */
2095:   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2096:   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

2098:   auto spiluk_handle = factors->kh.get_spiluk_handle();

2100:   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2101:   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2102:   Kokkos::realloc(factors->iU_d, n + 1);
2103:   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

2105:   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2106:   auto i_d = aijkok->i_dual.view_device();
2107:   auto j_d = aijkok->j_dual.view_device();
2108:   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2109:   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

2111:   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2112:   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2113:   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2114:   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

2116:   /* TODO: add options to select sptrsv algorithms */
2117:   /* Create sptrsv handles for L, U and their transpose */
2118: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2119:   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2120: #else
2121:   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2122: #endif

2124:   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2125:   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2126:   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2127:   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

2129:   /* Fill fields of the factor matrix B */
2130:   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2131:   b     = (Mat_SeqAIJ *)B->data;
2132:   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2133:   B->info.fill_ratio_given  = info->fill;
2134:   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;

2136:   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2137:   PetscFunctionReturn(PETSC_SUCCESS);
2138: }

2140: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2141: {
2142:   PetscFunctionBegin;
2143:   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2144:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2145:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2146:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2147:   PetscFunctionReturn(PETSC_SUCCESS);
2148: }

2150: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2151: {
2152:   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;

2154:   PetscFunctionBegin;
2155:   if (!info->factoronhost) {
2156:     PetscCall(ISIdentity(isrow, &row_identity));
2157:     PetscCall(ISIdentity(iscol, &col_identity));
2158:   }

2160:   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2161:   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));

2163:   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2164:     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2165:   } else {
2166:     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2167:     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2168:   }
2169:   PetscFunctionReturn(PETSC_SUCCESS);
2170: }
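/* A hedged options sketch: the device ILU(0) fast path above is taken only for zero fill levels
   with natural ordering and device factorization, e.g.
     -pc_type ilu -pc_factor_levels 0
   Positive levels, reordering, or factoronhost fall back to PETSc's host ILU as coded above. */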

2172: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2173: {
2174:   PetscFunctionBegin;
2175:   PetscCall(MatSeqAIJKokkosSyncHost(A));
2176:   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

2178:   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2179:     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2180:     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2181:     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2182:     const MatScalar            *Ba = b->a;
2183:     PetscInt                    m  = B->rmap->n;

2185:     if (factors->iU_h.extent(0) == 0) { // First numeric factorization
2186:       // Allocate memory and copy the structure
2187:       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2188:       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2189:       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2190:       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2191:       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2192:       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

2194:       // Build jU_h: prepend the diagonal entry to each row of Bj (Bj stores only the entries right of the diagonal)
2195:       PetscInt *Uj = factors->jU_h.data();
2196:       for (PetscInt i = 0; i < m; i++) {
2197:         PetscInt ulen = Bi[i + 1] - Bi[i];
2198:         Uj[Bi[i]]     = i;                                              // diagonal entry
2199:         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2200:       }

2202:       // Copy iU, jU to device
2203:       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2204:       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

2206:       // Copy row/col permutation to device
2207:       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2208:       PetscBool row_identity;
2209:       PetscCall(ISIdentity(rowperm, &row_identity));
2210:       if (!row_identity) {
2211:         const PetscInt *ip;

2213:         PetscCall(ISGetIndices(rowperm, &ip));
2214:         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2215:         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2216:         PetscCall(ISRestoreIndices(rowperm, &ip));
2217:         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2218:       }

2220:       // Create sptrsv handles for U and U^T
2221: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2222:       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2223: #else
2224:       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2225: #endif
2226:       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2227:       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2228:     }
2229:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2230:     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2231:     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

2233:     // Copy the value
2234:     PetscScalar *Ua = factors->aU_h.data();
2235:     PetscScalar *D  = factors->D_h.data();
2236:     for (PetscInt i = 0; i < m; i++) {
2237:       D[i]      = Ba[Bdiag[i]];     // Ba[Bdiag[i]] is actually the inverse of the diagonal entry
2238:       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2239:       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2240:     }
2241:     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2242:     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

2244:     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2245:     factors->transpose_updated         = PETSC_FALSE;
2246:   }

2248:   B->ops->matsolve          = NULL;
2249:   B->ops->matsolvetranspose = NULL;
2250:   PetscFunctionReturn(PETSC_SUCCESS);
2251: }

2253: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2254: {
2255:   PetscFunctionBegin;
2256:   if (info->solveonhost) {
2257:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2258:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2259:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2260:   }

2262:   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));

2264:   if (!info->solveonhost) {
2265:     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2266:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2267:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2268:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2269:   }
2270:   PetscFunctionReturn(PETSC_SUCCESS);
2271: }

2273: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2274: {
2275:   PetscFunctionBegin;
2276:   if (info->solveonhost) {
2277:     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2278:     PetscCall(MatSetType(B, MATSEQSBAIJ));
2279:     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2280:   }

2282:   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm

2284:   if (!info->solveonhost) {
2285:     // If solving on device, B is still a MATSEQAIJKOKKOS, so we can safely allocate B->spptr
2286:     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2287:     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2288:     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2289:   }
2290:   PetscFunctionReturn(PETSC_SUCCESS);
2291: }
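// A minimal sketch of selecting this Cholesky path programmatically instead of via the options
// database (pc assumed to come from an existing KSP via KSPGetPC()):
//   PetscCall(PCSetType(pc, PCCHOLESKY));
//   PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));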

2293: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2294: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2295: {
2296:   PetscFunctionBegin;
2297:   *type = MATSOLVERKOKKOS;
2298:   PetscFunctionReturn(PETSC_SUCCESS);
2299: }

2301: /*MC
2302:   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2303:   of type `MATSEQAIJKOKKOS` or `MATAIJKOKKOS` on a single GPU.

2305:   Level: beginner

2307: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2308: M*/
2309: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2310: {
2311:   PetscInt n = A->rmap->n;
2312:   MPI_Comm comm;

2314:   PetscFunctionBegin;
2315:   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2316:   PetscCall(MatCreate(comm, B));
2317:   PetscCall(MatSetSizes(*B, n, n, n, n));
2318:   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2319:   (*B)->factortype = ftype;
2320:   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2321:   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2322:   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");

2324:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2325:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2326:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2327:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2328:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2329:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2330:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2331:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2332:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2333:     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2334:     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2335:   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);

2337:   // The factorization can use the ordering provided to MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though the reordering itself is done on host
2338:   (*B)->canuseordering = PETSC_TRUE;
2339:   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2340:   PetscFunctionReturn(PETSC_SUCCESS);
2341: }
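// A minimal sketch of driving this factory directly, bypassing the PC interface (isrow, iscol,
// info, b and x are assumed to be set up by the caller):
//   Mat F;
//   PetscCall(MatGetFactor(A, MATSOLVERKOKKOS, MAT_FACTOR_LU, &F));
//   PetscCall(MatLUFactorSymbolic(F, A, isrow, iscol, &info));
//   PetscCall(MatLUFactorNumeric(F, A, &info));
//   PetscCall(MatSolve(F, b, x));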

2343: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2344: {
2345:   PetscFunctionBegin;
2346:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2347:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2348:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2349:   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2350:   PetscFunctionReturn(PETSC_SUCCESS);
2351: }
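// Each registration maps the triple (MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MatFactorType) to
// MatGetFactor_SeqAIJKokkos_Kokkos, which MatGetFactor() looks up at runtime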

2353: /* Utility to print out a KokkosCsrMatrix for debugging */
2354: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2355: {
2356:   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2357:   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2358:   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2359:   const PetscInt    *i  = iv.data();
2360:   const PetscInt    *j  = jv.data();
2361:   const PetscScalar *a  = av.data();
2362:   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();

2364:   PetscFunctionBegin;
2365:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " KokkosCsrMatrix, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2366:   for (PetscInt k = 0; k < m; k++) {
2367:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2368:     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2369:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2370:   }
2371:   PetscFunctionReturn(PETSC_SUCCESS);
2372: }
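// Example debugging call (a sketch, assuming aijkok points at the Mat_SeqAIJKokkos stored in a
// matrix's spptr and csrmat is its device CSR representation):
//   PetscCall(PrintCsrMatrix(aijkok->csrmat));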