Actual source code: aijkok.kokkos.cxx
#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscmat_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/kokkosimpl.hpp>
#include <petscsystypes.h>
#include <petscerror.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>

// To suppress compiler warnings:
// /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
// warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
// cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
// int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
#include <KokkosSparse_spmv.hpp>
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
#include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
#include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
using KokkosSparse::spiluk_symbolic;
using KokkosSparse::spiluk_numeric;
using KokkosSparse::sptrsv_symbolic;
using KokkosSparse::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#else
using KokkosSparse::Experimental::spiluk_symbolic;
using KokkosSparse::Experimental::spiluk_numeric;
using KokkosSparse::Experimental::sptrsv_symbolic;
using KokkosSparse::Experimental::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

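/* A quick recap of the Kokkos::DualView protocol that the routines below rely on: a DualView
   keeps a host copy and a device copy of an array, plus a flag recording which side was
   modified last. modify_host()/modify_device() set the flag, sync_host()/sync_device() copy
   the data only when the other side is stale (need_sync_host()/need_sync_device()), and
   clear_sync_state() marks both sides as coherent. MatSeqAIJKokkosSyncDevice() and
   MatSeqAIJKokkosModifyDevice() below are thin wrappers around this protocol for the CSR
   value array a_dual.
*/
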
/* MatAssemblyEnd_SeqAIJKokkos() is called once the nonzeros of the matrix have been finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet, or the nonzero pattern has changed */
    delete aijkok;
    aijkok   = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sync CSR data to device if not yet */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    aijkok->a_dual.sync_device();
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto              exec   = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect one needs factors on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCall(KokkosDualViewSync<HostMirrorMemorySpace>(aijkok->a_dual, exec));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    auto exec = PetscGetKokkosExecutionSpace();
    PetscCallCXX(aijkok->a_dual.sync_host(exec));
    PetscCallCXX(exec.fence());
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    auto exec = PetscGetKokkosExecutionSpace();
    PetscCallCXX(aijkok->a_dual.sync_host(exec));
    PetscCallCXX(exec.fence());
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* E.g., happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    aijkok->a_dual.sync_device();
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

   Input Parameter:
.  A - the MATSEQAIJKOKKOS matrix

   Output Parameters:
+  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
-  T_d    - the transpose on device, whose value array is allocated but not initialized
*/
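/* Implementation note: this is a counting sort carried out on host. The first pass counts the
   nonzeros in each column of A to build T's row map Ti (finished by a prefix sum); the second
   pass scatters A's column indices and source positions into Tj and perm; finally each row of
   T is sorted together with perm. perm records, for each nonzero of T, where its value lives
   in A's value array, so a numeric-only transpose update reduces to the gather Ta(i) = Aa(perm(i)).
*/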
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType          *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt               *Tj   = Tj_h.data();
  PetscInt               *perm = perm_h.data();
  PetscInt               *offset;

  PetscFunctionBegin;
  // Populate Ti
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r    = Aj[j]; // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp]   = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the transpose on device and cache it internally
// Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
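// Caching scheme: the first call builds the structure (csrmatT plus the gather permutation
// transpose_perm) with MatSeqAIJKokkosGenerateTransposeStructure(); subsequent calls only
// refresh the cached values with Ta(i) = Aa(perm(i)) when transpose_updated has been
// invalidated by MatSeqAIJKokkosSyncDevice() or MatSeqAIJKokkosModifyDevice(). Symmetric
// matrices short-circuit and return A's own csrmat.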
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT has been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) { // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian transpose on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix  &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian transpose and its values are up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH has been generated before; otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) { // if the values are out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto       &Ta   = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows would be more accurate, but it assumes there are no zero rows; so we are a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      yv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos          *aijkok;
  const char                *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView      zv;
  KokkosCsrMatrix            csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode   = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) {                     /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */
  } else if (reuse == MAT_REUSE_MATRIX) {                /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {              /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix, if not done yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

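// Usage note: MatCreate_SeqAIJKokkos() below relies on the MAT_INPLACE_MATRIX branch, i.e.,
//   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
// which retypes a freshly created SeqAIJ matrix in place without copying any matrix data.
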
/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
*/
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ       *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat               mat;

  PetscFunctionBegin;
  /* Do not copy values on host, as A's latest values might be on device. We don't want to do a sync blindly. */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should already have been zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat               At;
  KokkosCsrMatrix   internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
#endif
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using the Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
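// In other words, C = [A, B]: row i of C is row i of A followed by row i of B, with B's column
// indices shifted by A->cmap->n. Both the MAT_INITIAL_MATRIX and the MAT_REUSE_MATRIX paths
// below perform the concatenation entirely on device, one team per row.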
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ         *a, *b;
  Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt            m, n, nnz, aN;

  PetscFunctionBegin;

  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a    = static_cast<Mat_SeqAIJ *>(A->data);
  b    = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa   = akok->a_dual.view_device();
  ai   = akok->i_dual.view_device();
  ba   = bkok->a_dual.view_device();
  bi   = bkok->i_dual.view_device();
  m    = A->rmap->n; /* M, N and nnz of C */
  n    = A->cmap->n + B->cmap->n;
  nnz  = a->nz + b->nz;
  aN   = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj           = akok->j_dual.view_device();
    bj           = bkok->j_dual.view_device();
    auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0));
    auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0));
    auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0));
    ca           = ca_dual.view_device();
    ci           = ci_dual.view_device();
    cj           = cj_dual.view_device();

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
          ci(i) = coffset;
          if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    ca_dual.modify_device();
    ci_dual.modify_device();
    cj_dual.modify_device();
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca   = ckok->a_dual.view_device();
    ci   = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
{
  PetscFunctionBegin;
  delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  Mat                          A, B;
  bool                         transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  Mat_SeqAIJ                  *c;
  MatProductData_SeqAIJKokkos *pdata;
  KokkosCsrMatrix              csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., the user called MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs     = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product                 *product = C->product;
  MatProductType               ptype;
  Mat                          A, B;
  bool                         transA, transB;
  Mat_SeqAIJKokkos            *akok, *bkok, *ckok;
  MatProductData_SeqAIJKokkos *pdata;
  MPI_Comm                     comm;
  KokkosCsrMatrix              csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA 10.2's spgemm has bugs; we prefer the SpGEMMreuse APIs introduced in CUDA 11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
#endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* handles sparse matrix-matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
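// diag_dual mirrors aijseq->diag on device: for each row i, Adiag(i) is the position of the
// diagonal entry within the CSR value array, so Aa(Adiag(i)) addresses the diagonal of row i
// directly. MatGetDiagonal_SeqAIJKokkos() below treats Adiag(i) >= Ai(i+1) as a missing diagonal.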
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt                   n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n);
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa     = aijkok->a_dual.view_device();
    const auto &Adiag  = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
{
  Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
  ConstPetscScalarKokkosView lv, rv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  const auto &Aa     = aijkok->a_dual.view_device();
  const auto &Ai     = aijkok->i_dual.view_device();
  const auto &Aj     = aijkok->j_dual.view_device();
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetKokkosView(ll, &lv));
    PetscCallCXX(Kokkos::parallel_for( // for each row
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i   = t.league_rank(); // row i
        PetscInt len = Ai(i + 1) - Ai(i);
        // scale entries on the row
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
      }));
    PetscCall(VecRestoreKokkosView(ll, &lv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetKokkosView(rr, &rv));
    PetscCallCXX(Kokkos::parallel_for( // for each nonzero
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
    PetscCall(VecRestoreKokkosView(rr, &rv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos     *aijkok;
  PetscInt              n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");

  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;

  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  auto exec = PetscGetKokkosExecutionSpace();
  // Create host copies of the input aij
  auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d);
  auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d);
  // Don't copy the vals to the host now
  auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d);

  MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h);
  // Note we have modified device data, so it will be copied lazily
  a_dual.modify_device();
  MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h);
  MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h);

  PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Computes Y += alpha X */
static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
{
  Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
  ConstMatScalarKokkosView Xa;
  MatScalarKokkosView      Ya;
  auto                     exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
  PetscCheckTypeName(X, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(Y));
  PetscCall(MatSeqAIJKokkosSyncDevice(X));
  PetscCall(PetscLogGpuTimeBegin());

  if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
    PetscBool e;
    PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
    if (e) {
      PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
      if (e) pattern = SAME_NONZERO_PATTERN;
    }
  }

  /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
     If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, while MatAXPY does not.
  */
  ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
  xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
  Xa   = xkok->a_dual.view_device();
  Ya   = ykok->a_dual.view_device();

  if (pattern == SAME_NONZERO_PATTERN) {
    KokkosBlas::axpy(exec, alpha, Xa, Ya);
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else if (pattern == SUBSET_NONZERO_PATTERN) {
    MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
    MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); // row i
        Kokkos::single(Kokkos::PerTeam(t), [=]() {
          // Only one thread works in a team
          PetscInt p, q = Yi(i);
          for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
            while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
            if (Xj(p) == Yj(q)) {                        // Found it
              Ya(q) += alpha * Xa(p);
              q++;
            } else {
              // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
              // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
#if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
#else
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
#endif
            }
          }
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else { // different nonzero patterns
    Mat             Z;
    KokkosCsrMatrix zcsr;
    KernelHandle    kh;
    kh.create_spadd_handle(true); // X, Y are sorted
    KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
    KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
    zkok = new Mat_SeqAIJKokkos(zcsr);
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
    PetscCall(MatHeaderReplace(Y, &Z));
    kh.destroy_spadd_handle();
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
  PetscFunctionReturn(PETSC_SUCCESS);
}

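// Device-side mirror of the COO assembly maps built by MatSetPreallocationCOO_SeqAIJ(): for
// the i-th nonzero of the matrix, jmap(i)..jmap(i+1) delimits the range in perm of the COO
// entries that fold into it, and perm(k) indexes into the user-provided COO value array v[].
// This is exactly the gather performed in MatSetValuesCOO_SeqAIJKokkos() below.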
1341: struct MatCOOStruct_SeqAIJKokkos {
1342: PetscCount n;
1343: PetscCount Atot;
1344: PetscInt nz;
1345: PetscCountKokkosView jmap;
1346: PetscCountKokkosView perm;
1348: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1349: {
1350: nz = coo_h->nz;
1351: n = coo_h->n;
1352: Atot = coo_h->Atot;
1353: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1354: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1355: }
1356: };
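// Illustration with hypothetical input: on a 2x2 matrix, the COO triplets coo_i = {0,0,1},
// coo_j = {1,1,0} reduce to the two CSR nonzeros (0,1) and (1,0), so nz = 2, Atot = n = 3,
// jmap = {0,2,3} and perm = {0,1,2}; CSR nonzero i sums v[perm(k)] for k in [jmap(i), jmap(i+1)),
// as done in MatSetValuesCOO_SeqAIJKokkos() below.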
1358: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1359: {
1360: PetscFunctionBegin;
1361: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1362: PetscFunctionReturn(PETSC_SUCCESS);
1363: }
1365: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1366: {
1367: Mat_SeqAIJKokkos *akok;
1368: Mat_SeqAIJ *aseq;
1369: PetscContainer container_h;
1370: MatCOOStruct_SeqAIJ *coo_h;
1371: MatCOOStruct_SeqAIJKokkos *coo_d;
1373: PetscFunctionBegin;
1374: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1375: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1376: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1377: delete akok;
1378: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1379: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1381: // Copy the COO struct to device
1382: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1383: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1384: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1386: // Put the COO struct in a container and then attach that to the matrix
1387: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1388: PetscFunctionReturn(PETSC_SUCCESS);
1389: }
1391: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1392: {
1393: MatScalarKokkosView Aa;
1394: ConstMatScalarKokkosView kv;
1395: PetscMemType memtype;
1396: PetscContainer container;
1397: MatCOOStruct_SeqAIJKokkos *coo;
1399: PetscFunctionBegin;
1400: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1401: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1403: const auto &n = coo->n;
1404: const auto &Annz = coo->nz;
1405: const auto &jmap = coo->jmap;
1406: const auto &perm = coo->perm;
1408: PetscCall(PetscGetMemType(v, &memtype));
1409:   if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on host, we need to copy it to device */
1410: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1411: } else {
1412: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1413: }
1415: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1416: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1418: PetscCall(PetscLogGpuTimeBegin());
1419: Kokkos::parallel_for(
1420: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1421: PetscScalar sum = 0.0;
1422: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1423: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1424: });
1425: PetscCall(PetscLogGpuTimeEnd());
1427: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1428: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1429: PetscFunctionReturn(PETSC_SUCCESS);
1430: }
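// A minimal caller-side sketch of the COO assembly path implemented by the two routines above,
// assuming A is a 2x2 MATSEQAIJKOKKOS matrix:
//   PetscInt    coo_i[3] = {0, 0, 1};
//   PetscInt    coo_j[3] = {1, 1, 0}; // entry (0,1) appears twice; its contributions are summed
//   PetscScalar v[3]     = {1.0, 2.0, 3.0};
//   PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
//   PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // v[] may be a host or device pointer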
1432: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1433: {
1434: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1436: PetscFunctionBegin;
1437: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1438: A->boundtocpu = PETSC_FALSE;
1440: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1441: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1442: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1443: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1444: A->ops->scale = MatScale_SeqAIJKokkos;
1445: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1446: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1447: A->ops->mult = MatMult_SeqAIJKokkos;
1448: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1449: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1450: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1451: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1452: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1453: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1454: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1455: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1456: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1457: A->ops->shift = MatShift_SeqAIJKokkos;
1458: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1459: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1460: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1461: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1462: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1463: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1464: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1465: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1466: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1468: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1469: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1470: #if defined(PETSC_HAVE_HYPRE)
1471: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1472: #endif
1473: PetscFunctionReturn(PETSC_SUCCESS);
1474: }
1476: /*
1477:   Extract the (prescribed) diagonal blocks of the matrix and then invert them
1479: Input Parameters:
1480: + A - the MATSEQAIJKOKKOS matrix
1481: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1482: . bs2 - prefix sums of squared block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1483: . blkMap - maps row ids to block ids, i.e., row i belongs to block blkMap(i)
1484: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1486: Output Parameter:
1487: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
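   Example (derived from the definitions above): two diagonal blocks of sizes 2 and 3 in a 5-row
   matrix give bs = {0,2,5}, bs2 = {0,4,13} and blkMap = {0,0,1,1,1}; diagVal[] and work[] then
   hold 4 + 9 = 13 entries each.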
1488: */
1489: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1490: {
1491: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1492: PetscInt nblocks = bs.extent(0) - 1;
1494: PetscFunctionBegin;
1495: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1497: // Pull out the diagonal blocks of the matrix and then invert the blocks
1498: auto Aa = akok->a_dual.view_device();
1499: auto Ai = akok->i_dual.view_device();
1500: auto Aj = akok->j_dual.view_device();
1501: auto Adiag = akok->diag_dual.view_device();
1502: // TODO: how to tune the team size?
1503: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1504: auto ts = Kokkos::AUTO();
1505: #else
1506:   auto ts = 16; // ~30% faster than Kokkos::AUTO() with CUDA, but fails with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1507: #endif
1508: PetscCallCXX(Kokkos::parallel_for(
1509: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1510: const PetscInt bid = teamMember.league_rank(); // block id
1511: const PetscInt rstart = bs(bid); // this block starts from this row
1512: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1513: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1514: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1516: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1517: PetscInt i = rstart + r; // i-th row in A
1519: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1520: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1522: for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet
1523: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1524: B(r, c) = 0.0;
1525: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1526: B(r, c) = Aa(first + c);
1527: } else { // this entry does not show up in the CSR
1528: B(r, c) = 0.0;
1529: }
1530: }
1531: } else { // rare case that the diagonal does not exist
1532: const PetscInt begin = Ai(i);
1533: const PetscInt end = Ai(i + 1);
1534: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1535: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1536: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1537: else if (Aj(j) >= rstart + m) break;
1538: }
1539: }
1540: });
1542: // LU-decompose B (w/o pivoting) and then invert B
1543: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1544: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1545: }));
1546: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1547: PetscFunctionReturn(PETSC_SUCCESS);
1548: }
1550: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1551: {
1552: Mat_SeqAIJ *aseq;
1553: PetscInt i, m, n;
1554: auto exec = PetscGetKokkosExecutionSpace();
1556: PetscFunctionBegin;
1557: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1559: m = akok->nrows();
1560: n = akok->ncols();
1561: PetscCall(MatSetSizes(A, m, n, m, n));
1562: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1564: /* Set up data structures of A as a MATSEQAIJ */
1565: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1566: aseq = (Mat_SeqAIJ *)A->data;
1568: PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */
1569: PetscCallCXX(akok->j_dual.sync_host(exec));
1570: PetscCallCXX(exec.fence());
1572: aseq->i = akok->i_host_data();
1573: aseq->j = akok->j_host_data();
1574: aseq->a = akok->a_host_data();
1575:   aseq->nonew   = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1576: aseq->free_a = PETSC_FALSE;
1577: aseq->free_ij = PETSC_FALSE;
1578: aseq->nz = akok->nnz();
1579: aseq->maxnz = aseq->nz;
1581: PetscCall(PetscMalloc1(m, &aseq->imax));
1582: PetscCall(PetscMalloc1(m, &aseq->ilen));
1583: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1585: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1586: akok->nonzerostate = A->nonzerostate;
1587: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1588: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1589: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1590: PetscFunctionReturn(PETSC_SUCCESS);
1591: }
1593: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1594: {
1595: PetscFunctionBegin;
1596: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1597: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1598: PetscFunctionReturn(PETSC_SUCCESS);
1599: }
1601: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1602: {
1603: Mat_SeqAIJKokkos *akok;
1605: PetscFunctionBegin;
1606: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1607: PetscCall(MatCreate(comm, A));
1608: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1609: PetscFunctionReturn(PETSC_SUCCESS);
1610: }
1612: /* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1614:    Note that we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1615: */
1616: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1617: {
1618: PetscFunctionBegin;
1619: PetscCall(MatCreate(comm, A));
1620: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1621: PetscFunctionReturn(PETSC_SUCCESS);
1622: }
1624: /*@C
1625:   MatCreateSeqAIJKokkos - Creates a sequential sparse matrix in `MATSEQAIJKOKKOS` (compressed row)
1626:   format. This matrix will ultimately be handled by
1627:   Kokkos for calculations.
1629: Collective
1631: Input Parameters:
1632: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1633: . m - number of rows
1634: . n - number of columns
1635: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1636: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1638: Output Parameter:
1639: . A - the matrix
1641: Level: intermediate
1643: Notes:
1644: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1645:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1646: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1648: The AIJ format, also called
1649: compressed row storage, is fully compatible with standard Fortran
1650: storage. That is, the stored row and column indices can begin at
1651: either one (as in Fortran) or zero.
1653: Specify the preallocated storage with either `nz` or `nnz` (not both).
1654: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1655: allocation.
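   A minimal usage sketch:
.vb
   Mat A;
   PetscCall(MatCreateSeqAIJKokkos(PETSC_COMM_SELF, 100, 100, 5, NULL, &A)); // ~5 nonzeros per row
   // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), use A ...
   PetscCall(MatDestroy(&A));
.ve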
1657: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1658: @*/
1659: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1660: {
1661: PetscFunctionBegin;
1662: PetscCall(PetscKokkosInitializeCheck());
1663: PetscCall(MatCreate(comm, A));
1664: PetscCall(MatSetSizes(*A, m, n, m, n));
1665: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1666: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1667: PetscFunctionReturn(PETSC_SUCCESS);
1668: }
1670: // After numeric factorization, there are still steps to do before the triangular solves can be called.
1671: // For example, for transpose solves, we may need to compute the transposed factors explicitly if the solver does not support transpose solves natively (KokkosKernels does not, while cusparse does).
1672: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1673: // Similarly, in KokkosKernels sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck().
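// Typical sequence, as a sketch of how the pieces below fit together: each numeric factorization
// resets transpose_updated / sptrsv_symbolic_completed; the first MatSolve()/MatSolveTranspose()
// afterwards runs these check routines to redo sptrsv_symbolic() (and, for transpose solves, to
// rebuild the transposed factors) before calling sptrsv_solve().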
1674: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1675: {
1676: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1677:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1678:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky
1680: PetscFunctionBegin;
1681: if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1682: if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1683: if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1684: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1685: }
1686: PetscFunctionReturn(PETSC_SUCCESS);
1687: }
1689: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1690: {
1691: const PetscInt n = A->rmap->n;
1692: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1693:   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1694:   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky
1696: PetscFunctionBegin;
1697: if (!factors->transpose_updated) {
1698: if (has_upper) {
1699: if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
1700: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1701: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1702: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1703: }
1705: if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1706: if (!factors->U) {
1707: Mat_SeqAIJ *seq;
1709: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1710: PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1712: seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1713: factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1714: factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1715: factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1716: } else {
1717:         PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Ut's data is aliased with {i, j, a}Ut_h
1718: }
1719: // Copy Ut from host to device
1720: PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1721: PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1722: PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1723: } else { // If U was computed on device, we also compute the transpose there
1724:       // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices, so we sort them ourselves until KK provides finer control options.
1725: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1726: factors->jU_d, factors->aU_d,
1727: factors->iUt_d, factors->jUt_d,
1728: factors->aUt_d));
1729: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1730: }
1731: PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732: }
1734: // do the same for L with LU
1735: if (has_lower) {
1736: if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
1737: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1738: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1739: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1740: }
1742: if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1743: if (!factors->L) {
1744: Mat_SeqAIJ *seq;
1746: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1747: PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1749: seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1750: factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1751: factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1752: factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1753: } else {
1754:         PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Lt's data is aliased with {i, j, a}Lt_h
1755: }
1756: // Copy Lt from host to device
1757: PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1758: PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1759: PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1760: } else { // If L was computed on device, we also compute the transpose there
1761:       // TODO: KK transpose_matrix() does not sort column indices, but cusparse requires sorted indices, so we sort them ourselves until KK provides finer control options.
1762: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1763: factors->jL_d, factors->aL_d,
1764: factors->iLt_d, factors->jLt_d,
1765: factors->aLt_d));
1766: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1767: }
1768: PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769: }
1771: factors->transpose_updated = PETSC_TRUE;
1772: }
1773: PetscFunctionReturn(PETSC_SUCCESS);
1774: }
1776: // Solve Ax = b, with RAR^T = U^T D U, where R is the row (and col) permutation matrix on A.
1777: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
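// In detail: RAR^T = U^T D U gives A = R^T (U^T D U) R, so Ax = b becomes U^T D U (Rx) = Rb,
// which is solved below in three steps (note D is stored already inverted):
//   (1) solve U^T Y = Rb;  (2) Y' = D^{-1} Y (element-wise);  (3) solve U (Rx) = Y', then x = R^T (Rx).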
1778: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1779: {
1780: auto exec = PetscGetKokkosExecutionSpace();
1781: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1782: PetscInt m = A->rmap->n;
1783: PetscScalarKokkosView D = factors->D_d;
1784: PetscScalarKokkosView X, Y, B; // alias
1785: ConstPetscScalarKokkosView b;
1786: PetscScalarKokkosView x;
1787: PetscIntKokkosView &rowperm = factors->rowperm;
1788: PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1790: PetscFunctionBegin;
1791: PetscCall(PetscLogGpuTimeBegin());
1792: PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T
1793: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1794: PetscCall(VecGetKokkosView(bb, &b));
1795: PetscCall(VecGetKokkosViewWrite(xx, &x));
1797: // Solve U^T Y = B
1798:   if (identity) { // No reordering needed; use b directly as B
1799: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1800: Y = factors->workVector;
1801: } else {
1802: B = factors->workVector;
1803: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1804: Y = x;
1805: }
1806: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1808: // Solve diag(D) Y' = Y.
1809: // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1810: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1812: // Solve UX = Y
1813: if (identity) {
1814: X = x;
1815: } else {
1816: X = factors->workVector; // B is not needed anymore
1817: }
1818: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1820: // Reorder X with the inverse column (row) permutation
1821: if (!identity) {
1822: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1823: }
1825: PetscCall(VecRestoreKokkosView(bb, &b));
1826: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1827: PetscCall(PetscLogGpuTimeEnd());
1828: PetscFunctionReturn(PETSC_SUCCESS);
1829: }
1831: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1832: // R and C are represented by rowperm and colperm in factors.
1833: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
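// In detail: RAC = LU gives A = R^-1 L U C^-1, so Ax = b becomes L (U C^-1 x) = Rb,
// solved below as (1) L Y = Rb and (2) U X = Y with X = C^-1 x, then x = C X.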
1834: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1835: {
1836: auto exec = PetscGetKokkosExecutionSpace();
1837: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1838: PetscInt m = A->rmap->n;
1839: PetscScalarKokkosView X, Y, B; // alias
1840: ConstPetscScalarKokkosView b;
1841: PetscScalarKokkosView x;
1842: PetscIntKokkosView &rowperm = factors->rowperm;
1843: PetscIntKokkosView &colperm = factors->colperm;
1844: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1847: PetscFunctionBegin;
1848: PetscCall(PetscLogGpuTimeBegin());
1849: PetscCall(MatSeqAIJKokkosSolveCheck(A));
1850: PetscCall(VecGetKokkosView(bb, &b));
1851: PetscCall(VecGetKokkosViewWrite(xx, &x));
1853: // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
1854: if (row_identity) {
1855: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1856: Y = factors->workVector;
1857: } else {
1858: B = factors->workVector;
1859: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1860: Y = x;
1861: }
1862: PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1864: // Solve U C^- x = Y
1865: if (col_identity) {
1866: X = x;
1867: } else {
1868: X = factors->workVector;
1869: }
1870: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1872: // x = C X; Reorder X with the inverse col permutation
1873: if (!col_identity) {
1874: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1875: }
1877: PetscCall(VecRestoreKokkosView(bb, &b));
1878: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1879: PetscCall(PetscLogGpuTimeEnd());
1880: PetscFunctionReturn(PETSC_SUCCESS);
1881: }
1883: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1884: // R and C are represented by rowperm and colperm in factors.
1885: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1886: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1887: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1888: {
1889: auto exec = PetscGetKokkosExecutionSpace();
1890: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1891: PetscInt m = A->rmap->n;
1892: PetscScalarKokkosView X, Y, B; // alias
1893: ConstPetscScalarKokkosView b;
1894: PetscScalarKokkosView x;
1895: PetscIntKokkosView &rowperm = factors->rowperm;
1896: PetscIntKokkosView &colperm = factors->colperm;
1897: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1898: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1900: PetscFunctionBegin;
1901: PetscCall(PetscLogGpuTimeBegin());
1902: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1903: PetscCall(VecGetKokkosView(bb, &b));
1904: PetscCall(VecGetKokkosViewWrite(xx, &x));
1906: // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
1907:   if (col_identity) { // No reordering needed; use b directly as B
1908: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1909: Y = factors->workVector;
1910: } else {
1911: B = factors->workVector;
1912: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1913: Y = x;
1914: }
1915: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1917: // Solve L^T X = Y
1918: if (row_identity) {
1919: X = x;
1920: } else {
1921: X = factors->workVector;
1922: }
1923: PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1925: // x = R^- X = R^T X; Reorder X with the inverse row permutation
1926: if (!row_identity) {
1927: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1928: }
1930: PetscCall(VecRestoreKokkosView(bb, &b));
1931: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1932: PetscCall(PetscLogGpuTimeEnd());
1933: PetscFunctionReturn(PETSC_SUCCESS);
1934: }
1936: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1937: {
1938: PetscFunctionBegin;
1939: PetscCall(MatSeqAIJKokkosSyncHost(A));
1940: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1942: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1943: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1944: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1945: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1946: const MatScalar *Ba = b->a;
1947: PetscInt m = B->rmap->n, n = B->cmap->n;
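    // Layout of PETSc's combined factor storage, as unpacked below: row i of L (without its unit
    // diagonal) occupies Ba/Bj[Bi[i], Bi[i+1]); row i of U occupies (Bdiag[i+1], Bdiag[i]] with its
    // (inverted) diagonal at Bdiag[i], i.e., Bdiag[] decreases and U is stored back-to-front.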
1949: if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1950: // Allocate memory and copy the structure
1951: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1952: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1953: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1954: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1955: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1956: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1958: PetscInt *Li = factors->iL_h.data();
1959: PetscInt *Lj = factors->jL_h.data();
1960: PetscInt *Ui = factors->iU_h.data();
1961: PetscInt *Uj = factors->jU_h.data();
1963: Li[0] = Ui[0] = 0;
1964: for (PetscInt i = 0; i < m; i++) {
1965: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1966: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1968: PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen); // entries of L on the left of the diagonal
1969: Lj[Li[i] + llen] = i; // diagonal entry of L
1971: Uj[Ui[i]] = i; // diagonal entry of U
1972: PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1); // entries of U on the right of the diagonal
1974: Li[i + 1] = Li[i] + llen + 1;
1975: Ui[i + 1] = Ui[i] + ulen;
1976: }
1978: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1979: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1980: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1981: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1982: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1983: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1985: // Copy row/col permutation to device
1986: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1987: PetscBool row_identity;
1988: PetscCall(ISIdentity(rowperm, &row_identity));
1989: if (!row_identity) {
1990: const PetscInt *ip;
1992: PetscCall(ISGetIndices(rowperm, &ip));
1993: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1994: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1995: PetscCall(ISRestoreIndices(rowperm, &ip));
1996: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1997: }
1999: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
2000: PetscBool col_identity;
2001: PetscCall(ISIdentity(colperm, &col_identity));
2002: if (!col_identity) {
2003: const PetscInt *ip;
2005: PetscCall(ISGetIndices(colperm, &ip));
2006: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2007: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2008: PetscCall(ISRestoreIndices(colperm, &ip));
2009: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2010: }
2012: /* Create sptrsv handles for L, U and their transpose */
2013: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2014: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2015: #else
2016: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2017: #endif
2018: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2019: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2020: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2021: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2022: }
2024:     // Copy the values
2025: for (PetscInt i = 0; i < m; i++) {
2026: PetscInt llen = Bi[i + 1] - Bi[i];
2027: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2028: const PetscInt *Li = factors->iL_h.data();
2029: const PetscInt *Ui = factors->iU_h.data();
2031: PetscScalar *La = factors->aL_h.data();
2032: PetscScalar *Ua = factors->aU_h.data();
2034: PetscArraycpy(La + Li[i], Ba + Bi[i], llen); // entries of L
2035: La[Li[i] + llen] = 1.0; // diagonal entry
2037:     Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                 // diagonal entry (Ba stores the inverted diagonal, so invert it back)
2038: PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1); // entries of U
2039: }
2041: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2042: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2043: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2044: factors->transpose_updated = PETSC_FALSE;
2045: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2047: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2048: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2049: }
2051: B->ops->matsolve = NULL;
2052: B->ops->matsolvetranspose = NULL;
2053: PetscFunctionReturn(PETSC_SUCCESS);
2054: }
2056: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2057: {
2058: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2059: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2060: PetscInt fill_lev = info->levels;
2062: PetscFunctionBegin;
2063: PetscCall(PetscLogGpuTimeBegin());
2064: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2065: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2067: auto a_d = aijkok->a_dual.view_device();
2068: auto i_d = aijkok->i_dual.view_device();
2069: auto j_d = aijkok->j_dual.view_device();
2071: PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2073: B->assembled = PETSC_TRUE;
2074: B->preallocated = PETSC_TRUE;
2075: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2076: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2077: B->ops->matsolve = NULL;
2078: B->ops->matsolvetranspose = NULL;
2080: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2081: factors->transpose_updated = PETSC_FALSE;
2082: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2083:   /* TODO: log flops, but how do we know the flop count here? */
2084: PetscCall(PetscLogGpuTimeEnd());
2085: PetscFunctionReturn(PETSC_SUCCESS);
2086: }
2088: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2089: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2090: {
2091: Mat_SeqAIJKokkos *aijkok;
2092: Mat_SeqAIJ *b;
2093: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2094: PetscInt fill_lev = info->levels;
2095: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2096: PetscInt n = A->rmap->n;
2098: PetscFunctionBegin;
2099: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2100: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2102: /* Create a spiluk handle and then do symbolic factorization */
2103: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2104: factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2106: auto spiluk_handle = factors->kh.get_spiluk_handle();
2108: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2109: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2110: Kokkos::realloc(factors->iU_d, n + 1);
2111: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2113: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2114: auto i_d = aijkok->i_dual.view_device();
2115: auto j_d = aijkok->j_dual.view_device();
2116: PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2117: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2119: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2120: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2121: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2122: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2124: /* TODO: add options to select sptrsv algorithms */
2125: /* Create sptrsv handles for L, U and their transpose */
2126: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2127: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2128: #else
2129: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2130: #endif
2132: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2133: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2134: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2135: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2137: /* Fill fields of the factor matrix B */
2138: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2139: b = (Mat_SeqAIJ *)B->data;
2140: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2141: B->info.fill_ratio_given = info->fill;
2142: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2144: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2145: PetscFunctionReturn(PETSC_SUCCESS);
2146: }
2148: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2149: {
2150: PetscFunctionBegin;
2151: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2152: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2153: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2154: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2155: PetscFunctionReturn(PETSC_SUCCESS);
2156: }
2158: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2159: {
2160: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2162: PetscFunctionBegin;
2163: if (!info->factoronhost) {
2164: PetscCall(ISIdentity(isrow, &row_identity));
2165: PetscCall(ISIdentity(iscol, &col_identity));
2166: }
2168: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2169: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2171: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2172: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2173: } else {
2174: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2175: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2176: }
2177: PetscFunctionReturn(PETSC_SUCCESS);
2178: }
2180: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2181: {
2182: PetscFunctionBegin;
2183: PetscCall(MatSeqAIJKokkosSyncHost(A));
2184: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2186: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2187: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2188: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
2189: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2190: const MatScalar *Ba = b->a;
2191: PetscInt m = B->rmap->n;
2193: if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2194: // Allocate memory and copy the structure
2195: factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2196: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2197: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2198: factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
2199: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2200: factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2202:       // Build jU_h from Bj, which stores only the entries right of the diagonal, so insert the diagonal entry explicitly
2203: PetscInt *Uj = factors->jU_h.data();
2204: for (PetscInt i = 0; i < m; i++) {
2205: PetscInt ulen = Bi[i + 1] - Bi[i];
2206: Uj[Bi[i]] = i; // diagonal entry
2207: PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2208: }
2210: // Copy iU, jU to device
2211: PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2212: PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2214: // Copy row/col permutation to device
2215: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
2216: PetscBool row_identity;
2217: PetscCall(ISIdentity(rowperm, &row_identity));
2218: if (!row_identity) {
2219: const PetscInt *ip;
2221: PetscCall(ISGetIndices(rowperm, &ip));
2222: PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2223: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2224: PetscCall(ISRestoreIndices(rowperm, &ip));
2225: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2226: }
2228: // Create sptrsv handles for U and U^T
2229: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2230: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2231: #else
2232: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2233: #endif
2234: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2235: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2236: }
2237:     // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2238: B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
2239: B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2241:     // Copy the values
2242: PetscScalar *Ua = factors->aU_h.data();
2243: PetscScalar *D = factors->D_h.data();
2244: for (PetscInt i = 0; i < m; i++) {
2245:       D[i]     = Ba[Bdiag[i]]; // Ba[Bdiag[i]] already stores the inverse of the diagonal, which is what we want in D
2246: Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2247: for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2248: }
2249: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2250: PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2252: factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2253: factors->transpose_updated = PETSC_FALSE;
2254: }
2256: B->ops->matsolve = NULL;
2257: B->ops->matsolvetranspose = NULL;
2258: PetscFunctionReturn(PETSC_SUCCESS);
2259: }
2261: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2262: {
2263: PetscFunctionBegin;
2264: if (info->solveonhost) {
2265: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2266: PetscCall(MatSetType(B, MATSEQSBAIJ));
2267: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2268: }
2270: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2272: if (!info->solveonhost) {
2273: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2274: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2275: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2276: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2277: }
2278: PetscFunctionReturn(PETSC_SUCCESS);
2279: }
2281: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2282: {
2283: PetscFunctionBegin;
2284: if (info->solveonhost) {
2285: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2286: PetscCall(MatSetType(B, MATSEQSBAIJ));
2287: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2288: }
2290: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2292: if (!info->solveonhost) {
2293: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2294: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2295: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2296: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2297: }
2298: PetscFunctionReturn(PETSC_SUCCESS);
2299: }
2301: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2302: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2303: {
2304: PetscFunctionBegin;
2305: *type = MATSOLVERKOKKOS;
2306: PetscFunctionReturn(PETSC_SUCCESS);
2307: }
2309: /*MC
2310: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2311:   of type `MATSEQAIJKOKKOS`, `MATAIJKOKKOS` on a single GPU.
2313: Level: beginner
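   Select it at runtime with `-pc_factor_mat_solver_type kokkos`; a typical command line (the executable name is hypothetical) is
.vb
   ./your_app -mat_type aijkokkos -pc_type lu -pc_factor_mat_solver_type kokkos
.ve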
2315: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2316: M*/
2317: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2318: {
2319: PetscInt n = A->rmap->n;
2320: MPI_Comm comm;
2322: PetscFunctionBegin;
2323: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2324: PetscCall(MatCreate(comm, B));
2325: PetscCall(MatSetSizes(*B, n, n, n, n));
2326: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2327: (*B)->factortype = ftype;
2328: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2329: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2330: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2332: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2333: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2334: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2335: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2336: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2337: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2338: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2339: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2340: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2341: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2342: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2343: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2345:   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though the factorization itself is done on host
2346: (*B)->canuseordering = PETSC_TRUE;
2347: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2348: PetscFunctionReturn(PETSC_SUCCESS);
2349: }
2351: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2352: {
2353: PetscFunctionBegin;
2354: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2355: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2356: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2357: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2358: PetscFunctionReturn(PETSC_SUCCESS);
2359: }
2361: /* Utility to print out a KokkosCsrMatrix for debugging */
2362: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2363: {
2364: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2365: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2366: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2367: const PetscInt *i = iv.data();
2368: const PetscInt *j = jv.data();
2369: const PetscScalar *a = av.data();
2370: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2372: PetscFunctionBegin;
2373: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2374: for (PetscInt k = 0; k < m; k++) {
2375: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2376: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2377: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2378: }
2379: PetscFunctionReturn(PETSC_SUCCESS);
2380: }