Actual source code: aijkok.kokkos.cxx

#include <petsc_kokkos.hpp>
#include <petscvec_kokkos.hpp>
#include <petscmat_kokkos.hpp>
#include <petscpkg_version.h>
#include <petsc/private/petscimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/kokkosimpl.hpp>
#include <petscsys.h>

#include <Kokkos_Core.hpp>
#include <KokkosBlas.hpp>
#include <KokkosSparse_CrsMatrix.hpp>

// To suppress compiler warnings:
// /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
// warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
// cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
// int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
#include <KokkosSparse_spmv.hpp>
PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()

#include <KokkosSparse_spiluk.hpp>
#include <KokkosSparse_sptrsv.hpp>
#include <KokkosSparse_spgemm.hpp>
#include <KokkosSparse_spadd.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_InverseLU_Decl.hpp>

#include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
#include <KokkosSparse_Utils.hpp>
using KokkosSparse::sort_crs_matrix;
using KokkosSparse::Impl::transpose_matrix;
#else
#include <KokkosKernels_Sorting.hpp>
using KokkosKernels::sort_crs_matrix;
using KokkosKernels::Impl::transpose_matrix;
#endif

#if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
using KokkosSparse::spiluk_symbolic;
using KokkosSparse::spiluk_numeric;
using KokkosSparse::sptrsv_symbolic;
using KokkosSparse::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#else
using KokkosSparse::Experimental::spiluk_symbolic;
using KokkosSparse::Experimental::spiluk_numeric;
using KokkosSparse::Experimental::sptrsv_symbolic;
using KokkosSparse::Experimental::sptrsv_solve;
using KokkosSparse::Experimental::SPTRSVAlgorithm;
using KokkosSparse::Experimental::SPILUKAlgorithm;
#endif

static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */

/* MatAssemblyEnd_SeqAIJKokkos() runs once the nonzeros of the matrix are finalized, either after
   we assembled the matrix on host, or after we directly produced the matrix data on device (e.g., through MatMatMult).
   In the latter case, it is important to set a_dual's sync state correctly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ       *aijseq;
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  /* If aijkok does not exist, we just copy i, j to device.
     If aijkok already exists, but the device's nonzero pattern does not match the host's, we assume the latest data is on host.
     In both cases, we build a new aijkok structure.
  */
  if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
    if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
      PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
      PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscScalar), (void **)&aijseq->a));
      PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
      PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
      PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
      aijseq->free_a = PETSC_TRUE;
      aijseq->free_ij = PETSC_TRUE;
      /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
         we have the CsrMatrix on device first and then copy to host, followed by
         MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
         One could improve it by not using a NULL garray.
      */
    }
    delete aijkok;
    aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/);
    A->spptr = aijkok;
  } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
    MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
    auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
    aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Sync CSR data to device if not yet done */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
  PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  if (aijkok->a_dual.need_sync_device()) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose are out of date */
    aijkok->hermitian_updated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
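
// An explanatory note on the DualView protocol used throughout this file: a_dual pairs a host
// view and a device view of the values with per-side "modified" flags. A typical cycle is
//   a_dual.modify_host();                    // record that host values were just written
//   KokkosDualViewSyncDevice(a_dual, exec);  // copy host -> device only if device is stale
// so the Sync/Modify helpers here are cheap no-ops whenever the destination side already
// holds the latest data.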

/* Mark the CSR data on device as modified */
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matrices");
  aijkok->a_dual.clear_sync_state();
  aijkok->a_dual.modify_device();
  aijkok->transpose_updated = PETSC_FALSE;
  aijkok->hermitian_updated = PETSC_FALSE;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  auto exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  /* We do not expect factors to be needed on host */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
  PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
  PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  /* aijkok contains valid pointers only if the host's nonzerostate matches the device's.
     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the host's nonzerostate
     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
  */
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");

  if (i) *i = aijkok->i_device_data();
  if (j) *j = aijkok->j_device_data();
  if (a) {
    PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
    *a = aijkok->a_device_data();
  }
  if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.

  Input Parameter:
. A - the MATSEQAIJKOKKOS matrix

  Output Parameters:
+ perm_d - the permutation array on device, such that Ta(i) = Aa(perm(i))
- T_d - the transpose on device, whose value array is allocated but not initialized
*/
static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
{
  Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  const PetscInt *Ai = aseq->i, *Aj = aseq->j;
  MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
  MatRowMapType *Ti = Ti_h.data();
  MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
  MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
  PetscInt *Tj = Tj_h.data();
  PetscInt *perm = perm_h.data();
  PetscInt *offset;

  PetscFunctionBegin;
  // Populate Ti
  PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
  Ti++;
  for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
  Ti--;
  for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];

  // Populate Tj and the permutation array
  PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
  for (PetscInt i = 0; i < m; i++) {
    for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
      PetscInt r = Aj[j];                          // row r of T
      PetscInt disp = Ti[r] + offset[r];

      Tj[disp] = i; // col i of T
      perm[disp] = j;
      offset[r]++;
    }
  }
  PetscCall(PetscFree(offset));

  // Sort each row of T, along with the permutation array
  for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));

  // Output perm and T on device
  auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
  auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
  PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
  PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
  PetscFunctionReturn(PETSC_SUCCESS);
}
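
// A small worked example of the counting pass above (illustrative): for
//   A = [1 0 2; 0 3 4]  (m = 2, n = 3, nz = 4, Ai = {0,2,4}, Aj = {0,2,1,2})
// counting column occurrences and prefix-summing gives Ti = {0,1,2,4}; the fill-in pass then
// yields Tj = {0,1,0,1} and perm = {0,2,1,3}, so Ta(k) = Aa(perm(k)) produces
//   T = [1 0; 0 3; 2 4] = A^T.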

// Generate the transpose on device and cache it internally
// Note: KK transpose_matrix() does not support separate symbolic/numeric transpose phases, so we do it on our own
PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
{
  Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix &T = akok->csrmatT;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->symmetric == PETSC_BOOL3_TRUE) {
    *csrmatT = akok->csrmat;
  } else {
    // See if we already have a cached transpose and its value is up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->transpose_updated) {           // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto &Ta = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
    }
    akok->transpose_updated = PETSC_TRUE;
    *csrmatT = akok->csrmatT;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Generate the Hermitian transpose on device and cache it internally
static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
{
  Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
  KokkosCsrMatrix &T = akok->csrmatH;

  PetscFunctionBegin;
  PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
  PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device

  const auto &Aa = akok->a_dual.view_device();

  if (A->hermitian == PETSC_BOOL3_TRUE) {
    *csrmatH = akok->csrmat;
  } else {
    // See if we already have a cached Hermitian transpose and its value is up to date
    if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatH had been generated before, otherwise T has 0 rows/cols after construction
      if (!akok->hermitian_updated) {           // if the value is out of date, update the cached version
        const auto &perm = akok->transpose_perm; // get the permutation array
        auto &Ta = T.values;

        PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
      }
    } else { // Generate T of size n x m for the first time
      MatRowMapKokkosView perm;

      PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
      akok->transpose_perm = perm; // cache the perm in this matrix for reuse
      PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
    }
    akok->hermitian_updated = PETSC_TRUE;
    *csrmatH = akok->csrmatH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x */
static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView yv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^T x */
static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos *aijkok;
  const char *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView yv;
  KokkosCsrMatrix csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A^H x */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
{
  Mat_SeqAIJKokkos *aijkok;
  const char *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView yv;
  KokkosCsrMatrix csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosViewWrite(yy, &yv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
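
// The mode strings passed to KokkosSparse::spmv() above follow the BLAS convention: "N" applies
// the matrix as is, "T" its transpose, and "C" its conjugate (Hermitian) transpose. With
// A->form_explicit_transpose set, we instead multiply with a cached explicit transpose in "N"
// mode, which is typically faster on GPUs than a transposed spmv, at the cost of extra memory.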

/* z = A x + y */
static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos *aijkok;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView zv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^T x + y */
static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos *aijkok;
  const char *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView zv;
  KokkosCsrMatrix csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode = "T";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = A^H x + y */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_SeqAIJKokkos *aijkok;
  const char *mode;
  ConstPetscScalarKokkosView xv;
  PetscScalarKokkosView zv;
  KokkosCsrMatrix csrmat;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  if (zz != yy) PetscCall(VecCopy(yy, zz));
  PetscCall(VecGetKokkosView(xx, &xv));
  PetscCall(VecGetKokkosView(zz, &zv));
  if (A->form_explicit_transpose) {
    PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
    mode = "N";
  } else {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    csrmat = aijkok->csrmat;
    mode = "C";
  }
  PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
  PetscCall(VecRestoreKokkosView(xx, &xv));
  PetscCall(VecRestoreKokkosView(zz, &zv));
  PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Depending on reuse, either build a new mat, or use the existing mat */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_SeqAIJ *aseq;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    PetscCall(MatSetType(*newmat, mtype));
  } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
  } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
    PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
    PetscCall(PetscFree(A->defaultvectype));
    PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
    PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
    PetscCall(MatSetOps_SeqAIJKokkos(A));
    aseq = static_cast<Mat_SeqAIJ *>(A->data);
    if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
      PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
      A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
   an unassembled matrix, even though MAT_COPY_VALUES is not allowed for an unassembled matrix.
*/
static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
{
  Mat_SeqAIJ *bseq;
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
  Mat mat;

  PetscFunctionBegin;
  /* Do not copy values on host as A's latest values might be on device. We don't want to sync blindly */
  PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
  mat = *B;
  if (A->assembled) {
    bseq = static_cast<Mat_SeqAIJ *>(mat->data);
    bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
    bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
    /* Now copy values to B if needed */
    if (dupOption == MAT_COPY_VALUES) {
      if (akok->a_dual.need_sync_device()) {
        Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
        bkok->a_dual.modify_host();
      } else { /* If device has the latest data, we only copy data on device */
        Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
        bkok->a_dual.modify_device();
      }
    } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
      /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
      bkok->a_dual.modify_host();
    }
    mat->spptr = bkok;
  }

  PetscCall(PetscFree(mat->defaultvectype));
  PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
  PetscCall(MatSetOps_SeqAIJKokkos(mat));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
{
  Mat At;
  KokkosCsrMatrix internT;
  Mat_SeqAIJKokkos *atkok, *bkok;

  PetscFunctionBegin;
  if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
  PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
  if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
    /* Deep copy internT, as we want to isolate the internal transpose */
    PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
    if (reuse == MAT_INITIAL_MATRIX) *B = At;
    else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
  } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
    if ((*B)->assembled) {
      bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
      PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
      PetscCall(MatSeqAIJKokkosModifyDevice(*B));
    } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
      Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
      MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
      MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
      PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
      PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
    } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    delete aijkok;
  } else {
    delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
  }
  A->spptr = NULL;
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
#endif
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos

   A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types

   Options Database Key:
.  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
M*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ *a, *b;
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt m, n, nnz, aN;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a = static_cast<Mat_SeqAIJ *>(A->data);
  b = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa = akok->a_dual.view_device();
  ai = akok->i_dual.view_device();
  ba = bkok->a_dual.view_device();
  bi = bkok->i_dual.view_device();
  m = A->rmap->n; /* M, N and nnz of C */
  n = A->cmap->n + B->cmap->n;
  nnz = a->nz + b->nz;
  aN = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj = akok->j_dual.view_device();
    bj = bkok->j_dual.view_device();
    ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
    ci = MatRowMapKokkosView("i", ai.extent(0));
    cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
          ci(i) = coffset;
          if (i == m - 1) ci(m) = ai(m) + bi(m);
        });

        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca = ckok->a_dual.view_device();
    ci = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
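
// Illustration of the merge (a hypothetical dense example): with a 2x2 A and a 2x1 B,
//   ai = {0,2,4}, bi = {0,1,2}  =>  ci = {0,3,6},
// and row i of C holds A's row i followed by B's row i, with B's column indices shifted by
// aN = 2, so cj = {0,1,2, 0,1,2}. Each team handles one row; the Kokkos::single() writes the
// row offset once, while the team threads copy the row's entries in parallel.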

static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata)
{
  PetscFunctionBegin;
  delete static_cast<MatProductData_SeqAIJKokkos *>(pdata);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  bool transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  Mat_SeqAIJ *c;
  MatProductData_SeqAIJKokkos *pdata;
  KokkosCsrMatrix csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., the user called MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that the next time the user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product *product = C->product;
  MatProductType ptype;
  Mat A, B;
  bool transA, transB;
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  MatProductData_SeqAIJKokkos *pdata;
  MPI_Comm comm;
  KokkosCsrMatrix csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos());
  pdata->reusesym = product->api_user;

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d set up, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixes this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK! */
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductDataDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}
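
// The symbolic/numeric split above mirrors the KokkosSparse spgemm API: spgemm_symbolic()
// computes C's row map from the sparsity patterns of A and B, while spgemm_numeric() fills in
// C's column indices and values. The spgemm handle created here is kept in MatProductData so
// that MatProductNumeric above can rerun the numeric phase cheaply under MAT_REUSE_MATRIX.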

/* handles sparse matrix-matrix ops */
static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
  if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
  if (Biskok && Ciskok) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagonaldense) { // no missing diagonals
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n);

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa = aijkok->a_dual.view_device();
    const auto &Adiag = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n);
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa = aijkok->a_dual.view_device();
    const auto &Adiag = aijkok->diag_dual.view_device();
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
  ConstPetscScalarKokkosView lv, rv;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  const auto &Aa = aijkok->a_dual.view_device();
  const auto &Ai = aijkok->i_dual.view_device();
  const auto &Aj = aijkok->j_dual.view_device();
  if (ll) {
    PetscCall(VecGetLocalSize(ll, &m));
    PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
    PetscCall(VecGetKokkosView(ll, &lv));
    PetscCallCXX(Kokkos::parallel_for( // for each row
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); // row i
        PetscInt len = Ai(i + 1) - Ai(i);
        // scale entries on the row
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
      }));
    PetscCall(VecRestoreKokkosView(ll, &lv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  if (rr) {
    PetscCall(VecGetLocalSize(rr, &n));
    PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
    PetscCall(VecGetKokkosView(rr, &rv));
    PetscCallCXX(Kokkos::parallel_for( // for each nonzero
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
    PetscCall(VecRestoreKokkosView(rr, &rv));
    PetscCall(PetscLogGpuFlops(nz));
  }
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  if (aijkok) { /* Only zero the device if data is already there */
    KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
  } else { /* Might be preallocated but not assembled */
    PetscCall(MatZeroEntries_SeqAIJ(A));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos *aijkok;
  PetscInt n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa = aijkok->a_dual.view_device();
  const auto &Ai = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv));
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}
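
// A minimal usage sketch of the constructor above (hypothetical sizes and values): build a
// 2x2 identity directly from device views, with no host-to-device copy involved:
//   Kokkos::View<PetscInt *>    i_d("i", 3), j_d("j", 2); // fill with {0,1,2} and {0,1}
//   Kokkos::View<PetscScalar *> a_d("a", 2);              // fill with {1.0, 1.0}
//   Mat A;
//   PetscCall(MatCreateSeqAIJKokkosWithKokkosViews(PETSC_COMM_SELF, 2, 2, i_d, j_d, a_d, &A));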

/* Computes Y += alpha X */
static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
{
  Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
  ConstMatScalarKokkosView Xa;
  MatScalarKokkosView Ya;
  auto exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
  PetscCheckTypeName(X, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(Y));
  PetscCall(MatSeqAIJKokkosSyncDevice(X));
  PetscCall(PetscLogGpuTimeBegin());

  if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
    PetscBool e;
    PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
    if (e) {
      PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
      if (e) pattern = SAME_NONZERO_PATTERN;
    }
  }

  /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew the sparsity pattern of C, one could skip
     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
     If X is a SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
     KokkosSparse::spadd(alpha,A,beta,B,C) has separate symbolic and numeric phases, while MatAXPY does not.
  */
  ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
  xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
  Xa = xkok->a_dual.view_device();
  Ya = ykok->a_dual.view_device();

  if (pattern == SAME_NONZERO_PATTERN) {
    KokkosBlas::axpy(exec, alpha, Xa, Ya);
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else if (pattern == SUBSET_NONZERO_PATTERN) {
    MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
    MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); // row i
        Kokkos::single(Kokkos::PerTeam(t), [=]() {
          // Only one thread works in a team
          PetscInt p, q = Yi(i);
          for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
            while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
            if (Xj(p) == Yj(q)) {                        // Found it
              Ya(q) += alpha * Xa(p);
              q++;
            } else {
              // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
              // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
#if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
#else
              if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
#endif
            }
          }
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
  } else { // different nonzero patterns
    Mat Z;
    KokkosCsrMatrix zcsr;
    KernelHandle kh;
    kh.create_spadd_handle(true); // X, Y are sorted
    KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
    KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
    zkok = new Mat_SeqAIJKokkos(zcsr);
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
    PetscCall(MatHeaderReplace(Y, &Z));
    kh.destroy_spadd_handle();
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
  PetscFunctionReturn(PETSC_SUCCESS);
}
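
// Summary of the three paths above: SAME_NONZERO_PATTERN is a plain axpy on the value arrays;
// SUBSET_NONZERO_PATTERN walks each row of Y once, merging in X's entries (and poisons the row
// with a NaN if X turns out not to be a subset of Y); any other pattern falls back to a full
// symbolic + numeric KokkosSparse::spadd whose result replaces Y's storage via MatHeaderReplace.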
1333: struct MatCOOStruct_SeqAIJKokkos {
1334: PetscCount n;
1335: PetscCount Atot;
1336: PetscInt nz;
1337: PetscCountKokkosView jmap;
1338: PetscCountKokkosView perm;
1340: MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1341: {
1342: nz = coo_h->nz;
1343: n = coo_h->n;
1344: Atot = coo_h->Atot;
1345: jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1346: perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1347: }
1348: };
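/* Worked example of the jmap/perm encoding (illustrative): for user input coo_i = {0, 0, 1},
   coo_j = {1, 1, 2}, there are two unique nonzeros, (0,1) and (1,2), so nz = 2, n = Atot = 3,
   jmap = {0, 2, 3} and perm = {0, 1, 2}: the i-th nonzero accumulates v[perm(k)] for
   k in [jmap(i), jmap(i+1)), exactly as MatSetValuesCOO_SeqAIJKokkos() below does. */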
1350: static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data)
1351: {
1352: PetscFunctionBegin;
1353: PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data));
1354: PetscFunctionReturn(PETSC_SUCCESS);
1355: }
1357: static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1358: {
1359: Mat_SeqAIJKokkos *akok;
1360: Mat_SeqAIJ *aseq;
1361: PetscContainer container_h;
1362: MatCOOStruct_SeqAIJ *coo_h;
1363: MatCOOStruct_SeqAIJKokkos *coo_d;
1365: PetscFunctionBegin;
1366: PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1367: aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1368: akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1369: delete akok;
1370: mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1371: PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1373: // Copy the COO struct to device
1374: PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1375: PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
1376: PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1378: // Put the COO struct in a container and then attach that to the matrix
1379: PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1380: PetscFunctionReturn(PETSC_SUCCESS);
1381: }
1383: static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1384: {
1385: MatScalarKokkosView Aa;
1386: ConstMatScalarKokkosView kv;
1387: PetscMemType memtype;
1388: PetscContainer container;
1389: MatCOOStruct_SeqAIJKokkos *coo;
1391: PetscFunctionBegin;
1392: PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1393: PetscCall(PetscContainerGetPointer(container, (void **)&coo));
1395: const auto &n = coo->n;
1396: const auto &Annz = coo->nz;
1397: const auto &jmap = coo->jmap;
1398: const auto &perm = coo->perm;
1400: PetscCall(PetscGetMemType(v, &memtype));
1401: if (PetscMemTypeHost(memtype)) { /* If v[] was provided in host memory, copy it to device */
1402: kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1403: } else {
1404: kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1405: }
1407: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1408: else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */
1410: PetscCall(PetscLogGpuTimeBegin());
1411: Kokkos::parallel_for(
1412: Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1413: PetscScalar sum = 0.0;
1414: for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1415: Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1416: });
1417: PetscCall(PetscLogGpuTimeEnd());
1419: if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1420: else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1421: PetscFunctionReturn(PETSC_SUCCESS);
1422: }
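/* A minimal COO assembly sketch (illustrative indices and values; A is a MATSEQAIJKOKKOS matrix):

     PetscInt    coo_i[] = {0, 0, 1};
     PetscInt    coo_j[] = {1, 1, 2};
     PetscScalar v[]     = {1.0, 2.0, 3.0}; // may reside in host or device memory
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, v, INSERT_VALUES)); // entry (0,1) gets 1.0 + 2.0 = 3.0
*/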
1424: static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1425: {
1426: Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1428: PetscFunctionBegin;
1429: A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1430: A->boundtocpu = PETSC_FALSE;
1432: A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
1433: A->ops->destroy = MatDestroy_SeqAIJKokkos;
1434: A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
1435: A->ops->axpy = MatAXPY_SeqAIJKokkos;
1436: A->ops->scale = MatScale_SeqAIJKokkos;
1437: A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
1438: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
1439: A->ops->mult = MatMult_SeqAIJKokkos;
1440: A->ops->multadd = MatMultAdd_SeqAIJKokkos;
1441: A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
1442: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
1443: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
1444: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1445: A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1446: A->ops->transpose = MatTranspose_SeqAIJKokkos;
1447: A->ops->setoption = MatSetOption_SeqAIJKokkos;
1448: A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
1449: A->ops->shift = MatShift_SeqAIJKokkos;
1450: A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
1451: A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
1452: a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
1453: a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
1454: a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1455: a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1456: a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1457: a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1458: a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1460: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1461: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1462: #if defined(PETSC_HAVE_HYPRE)
1463: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1464: #endif
1465: PetscFunctionReturn(PETSC_SUCCESS);
1466: }
1468: /*
1469: Extract the (prescribed) diagonal blocks of the matrix and then invert them
1471: Input Parameters:
1472: + A - the MATSEQAIJKOKKOS matrix
1473: . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1474: . bs2 - running sum of squared block sizes in 'csr' format, i.e., the i-th block is stored at offset bs2(i) in diagVal[]
1475: . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1476: - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1478: Output Parameter:
1479: . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1480: */
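/* Example (illustrative): two diagonal blocks of sizes 2 and 3 give bs = {0, 2, 5},
   bs2 = {0, 4, 13} (the 2x2 block occupies diagVal[0..3] and the 3x3 block diagVal[4..12]),
   and blkMap = {0, 0, 1, 1, 1}. */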
1481: PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1482: {
1483: Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1484: PetscInt nblocks = bs.extent(0) - 1;
1486: PetscFunctionBegin;
1487: PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1489: // Pull out the diagonal blocks of the matrix and then invert the blocks
1490: auto Aa = akok->a_dual.view_device();
1491: auto Ai = akok->i_dual.view_device();
1492: auto Aj = akok->j_dual.view_device();
1493: auto Adiag = akok->diag_dual.view_device();
1494: // TODO: how to tune the team size?
1495: #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1496: auto ts = Kokkos::AUTO();
1497: #else
1498: auto ts = 16; // improved performance by ~30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1499: #endif
1500: PetscCallCXX(Kokkos::parallel_for(
1501: Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1502: const PetscInt bid = teamMember.league_rank(); // block id
1503: const PetscInt rstart = bs(bid); // this block starts from this row
1504: const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
1505: const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1506: const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1508: Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1509: PetscInt i = rstart + r; // i-th row in A
1511: if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1512: PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row
1514: for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
1515: if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1516: B(r, c) = 0.0;
1517: } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1518: B(r, c) = Aa(first + c);
1519: } else { // this entry does not show up in the CSR
1520: B(r, c) = 0.0;
1521: }
1522: }
1523: } else { // rare case that the diagonal does not exist
1524: const PetscInt begin = Ai(i);
1525: const PetscInt end = Ai(i + 1);
1526: for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1527: for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1528: if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1529: else if (Aj(j) >= rstart + m) break;
1530: }
1531: }
1532: });
1534: // LU-decompose B (w/o pivoting) and then invert B
1535: KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1536: KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1537: }));
1538: // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1539: PetscFunctionReturn(PETSC_SUCCESS);
1540: }
1542: PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1543: {
1544: Mat_SeqAIJ *aseq;
1545: PetscInt i, m, n;
1546: auto exec = PetscGetKokkosExecutionSpace();
1548: PetscFunctionBegin;
1549: PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1551: m = akok->nrows();
1552: n = akok->ncols();
1553: PetscCall(MatSetSizes(A, m, n, m, n));
1554: PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1556: /* Set up data structures of A as a MATSEQAIJ */
1557: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1558: aseq = (Mat_SeqAIJ *)A->data;
1560: PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1561: PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1563: aseq->i = akok->i_host_data();
1564: aseq->j = akok->j_host_data();
1565: aseq->a = akok->a_host_data();
1566: aseq->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
1567: aseq->free_a = PETSC_FALSE;
1568: aseq->free_ij = PETSC_FALSE;
1569: aseq->nz = akok->nnz();
1570: aseq->maxnz = aseq->nz;
1572: PetscCall(PetscMalloc1(m, &aseq->imax));
1573: PetscCall(PetscMalloc1(m, &aseq->ilen));
1574: for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1576: /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1577: akok->nonzerostate = A->nonzerostate;
1578: A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1579: PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1580: PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1581: PetscFunctionReturn(PETSC_SUCCESS);
1582: }
1584: PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1585: {
1586: PetscFunctionBegin;
1587: PetscCall(MatSeqAIJKokkosSyncDevice(A));
1588: *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1589: PetscFunctionReturn(PETSC_SUCCESS);
1590: }
1592: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1593: {
1594: Mat_SeqAIJKokkos *akok;
1596: PetscFunctionBegin;
1597: PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1598: PetscCall(MatCreate(comm, A));
1599: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1600: PetscFunctionReturn(PETSC_SUCCESS);
1601: }
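/* A minimal sketch (illustrative; assumes row_map, entries and values are existing device views
   describing a CSR matrix with nrows rows, ncols columns and nnz nonzeros):

     KokkosCsrMatrix csr("csr", nrows, ncols, nnz, values, row_map, entries);
     Mat             A;
     PetscCall(MatCreateSeqAIJKokkosWithKokkosCsrMatrix(PETSC_COMM_SELF, csr, &A));
*/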
1603: /* Create a MATSEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1605: Note PETSc has names like MatSeqAIJSetPreallocationCSR(), hence the capitalized CSR here
1606: */
1607: PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1608: {
1609: PetscFunctionBegin;
1610: PetscCall(MatCreate(comm, A));
1611: PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1612: PetscFunctionReturn(PETSC_SUCCESS);
1613: }
1615: /*@C
1616: MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1617: (the default sequential PETSc matrix format on Kokkos devices). This matrix will ultimately be handled by
1618: Kokkos for calculations.
1620: Collective
1622: Input Parameters:
1623: + comm - MPI communicator, set to `PETSC_COMM_SELF`
1624: . m - number of rows
1625: . n - number of columns
1626: . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1627: - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1629: Output Parameter:
1630: . A - the matrix
1632: Level: intermediate
1634: Notes:
1635: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1636: MatXXXXSetPreallocation() paradigm instead of this routine directly.
1637: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1639: The AIJ format, also called
1640: compressed row storage, is fully compatible with standard Fortran
1641: storage. That is, the stored row and column indices can begin at
1642: either one (as in Fortran) or zero.
1644: Specify the preallocated storage with either `nz` or `nnz` (not both).
1645: Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1646: allocation.
1648: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1649: @*/
1650: PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1651: {
1652: PetscFunctionBegin;
1653: PetscCall(PetscKokkosInitializeCheck());
1654: PetscCall(MatCreate(comm, A));
1655: PetscCall(MatSetSizes(*A, m, n, m, n));
1656: PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1657: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1658: PetscFunctionReturn(PETSC_SUCCESS);
1659: }
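/* A minimal sketch of the recommended paradigm mentioned above (illustrative sizes):

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, 10, 10, 10, 10));
     PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL)); // about 5 nonzeros per row
*/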
1661: // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1662: // For example, for transpose solve, we might need to compute the transpose matrices explicitly if the solver does not support transpose solve directly (KK does not, while cusparse does).
1663: // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1664: // Similarly, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
1665: static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1666: {
1667: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1668: const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1669: const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Cholesky
1671: PetscFunctionBegin;
1672: if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1673: if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1674: if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1675: factors->sptrsv_symbolic_completed = PETSC_TRUE;
1676: }
1677: PetscFunctionReturn(PETSC_SUCCESS);
1678: }
1680: static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1681: {
1682: const PetscInt n = A->rmap->n;
1683: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1684: const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
1685: const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky
1687: PetscFunctionBegin;
1688: if (!factors->transpose_updated) {
1689: if (has_upper) {
1690: if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
1691: factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1692: factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1693: factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1694: }
1696: if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1697: if (!factors->U) {
1698: Mat_SeqAIJ *seq;
1700: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1701: PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1703: seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1704: factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1705: factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1706: factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1707: } else {
1708: PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut's data is aliased with {i, j, a}Ut_h
1709: }
1710: // Copy Ut from host to device
1711: PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1712: PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1713: PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1714: } else { // If U was computed on device, we also compute the transpose there
1715: // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1716: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1717: factors->jU_d, factors->aU_d,
1718: factors->iUt_d, factors->jUt_d,
1719: factors->aUt_d));
1720: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1721: }
1722: PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1723: }
1725: // do the same for L with LU
1726: if (has_lower) {
1727: if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
1728: factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1729: factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1730: factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1731: }
1733: if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1734: if (!factors->L) {
1735: Mat_SeqAIJ *seq;
1737: PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1738: PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1740: seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1741: factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1742: factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1743: factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1744: } else {
1745: PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt's data is aliased with {i, j, a}Lt_h
1746: }
1747: // Copy Lt from host to device
1748: PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1749: PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1750: PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1751: } else { // If L was computed on device, we also compute the transpose there
1752: // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1753: PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1754: factors->jL_d, factors->aL_d,
1755: factors->iLt_d, factors->jLt_d,
1756: factors->aLt_d));
1757: PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1758: }
1759: PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1760: }
1762: factors->transpose_updated = PETSC_TRUE;
1763: }
1764: PetscFunctionReturn(PETSC_SUCCESS);
1765: }
1767: // Solve Ax = b, with R A R^T = U^T D U, where R is the row (and col) permutation matrix on A.
1768: // R is represented by rowperm in factors. If R is identity (i.e., no reordering), then rowperm is empty.
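// Since R^T = R^-1 for a permutation, Ax = b is equivalent to (U^T D U)(R x) = R b, solved in three steps:
//   (1) U^T y = R b   (a transpose solve, hence the TransposeSolveCheck below)
//   (2) z = D y       (D already stores the inverted diagonal, so this is an element-wise product)
//   (3) U w = z, then x = R^T w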
1769: static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1770: {
1771: auto exec = PetscGetKokkosExecutionSpace();
1772: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1773: PetscInt m = A->rmap->n;
1774: PetscScalarKokkosView D = factors->D_d;
1775: PetscScalarKokkosView X, Y, B; // alias
1776: ConstPetscScalarKokkosView b;
1777: PetscScalarKokkosView x;
1778: PetscIntKokkosView &rowperm = factors->rowperm;
1779: PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1781: PetscFunctionBegin;
1782: PetscCall(PetscLogGpuTimeBegin());
1783: PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for U X = Y
1784: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1785: PetscCall(VecGetKokkosView(bb, &b));
1786: PetscCall(VecGetKokkosViewWrite(xx, &x));
1788: // Solve U^T Y = B
1789: if (identity) { // No reordering; use b directly
1790: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1791: Y = factors->workVector;
1792: } else {
1793: B = factors->workVector;
1794: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); // Reorder b with the row permutation
1795: Y = x;
1796: }
1797: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1799: // Solve diag(D) Y' = Y.
1800: // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1801: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1803: // Solve UX = Y
1804: if (identity) {
1805: X = x;
1806: } else {
1807: X = factors->workVector; // B is not needed anymore
1808: }
1809: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1811: // Reorder X with the inverse column (row) permutation
1812: if (!identity) {
1813: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1814: }
1816: PetscCall(VecRestoreKokkosView(bb, &b));
1817: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1818: PetscCall(PetscLogGpuTimeEnd());
1819: PetscFunctionReturn(PETSC_SUCCESS);
1820: }
1822: // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1823: // R and C are represented by rowperm and colperm in factors.
1824: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1825: static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1826: {
1827: auto exec = PetscGetKokkosExecutionSpace();
1828: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1829: PetscInt m = A->rmap->n;
1830: PetscScalarKokkosView X, Y, B; // alias
1831: ConstPetscScalarKokkosView b;
1832: PetscScalarKokkosView x;
1833: PetscIntKokkosView &rowperm = factors->rowperm;
1834: PetscIntKokkosView &colperm = factors->colperm;
1835: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1836: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1838: PetscFunctionBegin;
1839: PetscCall(PetscLogGpuTimeBegin());
1840: PetscCall(MatSeqAIJKokkosSolveCheck(A));
1841: PetscCall(VecGetKokkosView(bb, &b));
1842: PetscCall(VecGetKokkosViewWrite(xx, &x));
1844: // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
1845: if (row_identity) {
1846: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1847: Y = factors->workVector;
1848: } else {
1849: B = factors->workVector;
1850: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1851: Y = x;
1852: }
1853: PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1855: // Solve U C^- x = Y
1856: if (col_identity) {
1857: X = x;
1858: } else {
1859: X = factors->workVector;
1860: }
1861: PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1863: // x = C X; Reorder X with the inverse col permutation
1864: if (!col_identity) {
1865: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1866: }
1868: PetscCall(VecRestoreKokkosView(bb, &b));
1869: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1870: PetscCall(PetscLogGpuTimeEnd());
1871: PetscFunctionReturn(PETSC_SUCCESS);
1872: }
1874: // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1875: // R and C are represented by rowperm and colperm in factors.
1876: // If R or C is identity (i.e., no reordering), then rowperm or colperm is empty.
1877: // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
1878: static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1879: {
1880: auto exec = PetscGetKokkosExecutionSpace();
1881: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1882: PetscInt m = A->rmap->n;
1883: PetscScalarKokkosView X, Y, B; // alias
1884: ConstPetscScalarKokkosView b;
1885: PetscScalarKokkosView x;
1886: PetscIntKokkosView &rowperm = factors->rowperm;
1887: PetscIntKokkosView &colperm = factors->colperm;
1888: PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1889: PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1891: PetscFunctionBegin;
1892: PetscCall(PetscLogGpuTimeBegin());
1893: PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1894: PetscCall(VecGetKokkosView(bb, &b));
1895: PetscCall(VecGetKokkosViewWrite(xx, &x));
1897: // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
1898: if (col_identity) { // No reordering; use b directly
1899: B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1900: Y = factors->workVector;
1901: } else {
1902: B = factors->workVector;
1903: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); // Reorder b with the col permutation
1904: Y = x;
1905: }
1906: PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1908: // Solve L^T X = Y
1909: if (row_identity) {
1910: X = x;
1911: } else {
1912: X = factors->workVector;
1913: }
1914: PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1916: // x = R^- X = R^T X; Reorder X with the inverse row permutation
1917: if (!row_identity) {
1918: PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1919: }
1921: PetscCall(VecRestoreKokkosView(bb, &b));
1922: PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1923: PetscCall(PetscLogGpuTimeEnd());
1924: PetscFunctionReturn(PETSC_SUCCESS);
1925: }
1927: static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1928: {
1929: PetscFunctionBegin;
1930: PetscCall(MatSeqAIJKokkosSyncHost(A));
1931: PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1933: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1934: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1935: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
1936: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1937: const MatScalar *Ba = b->a;
1938: PetscInt m = B->rmap->n, n = B->cmap->n;
1940: if (factors->iL_h.extent(0) == 0) { // First numeric factorization: allocate memory and copy the L, U structure
1942: factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1943: factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1944: factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1945: factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1946: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1947: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1949: PetscInt *Li = factors->iL_h.data();
1950: PetscInt *Lj = factors->jL_h.data();
1951: PetscInt *Ui = factors->iU_h.data();
1952: PetscInt *Uj = factors->jU_h.data();
1954: Li[0] = Ui[0] = 0;
1955: for (PetscInt i = 0; i < m; i++) {
1956: PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
1957: PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1959: PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1960: Lj[Li[i] + llen] = i; // diagonal entry of L
1962: Uj[Ui[i]] = i; // diagonal entry of U
1963: PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
1965: Li[i + 1] = Li[i] + llen + 1;
1966: Ui[i + 1] = Ui[i] + ulen;
1967: }
1969: factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1970: factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1971: factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1972: factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1973: factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1974: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1976: // Copy row/col permutation to device
1977: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
1978: PetscBool row_identity;
1979: PetscCall(ISIdentity(rowperm, &row_identity));
1980: if (!row_identity) {
1981: const PetscInt *ip;
1983: PetscCall(ISGetIndices(rowperm, &ip));
1984: factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1985: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1986: PetscCall(ISRestoreIndices(rowperm, &ip));
1987: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1988: }
1990: IS colperm = ((Mat_SeqAIJ *)B->data)->col;
1991: PetscBool col_identity;
1992: PetscCall(ISIdentity(colperm, &col_identity));
1993: if (!col_identity) {
1994: const PetscInt *ip;
1996: PetscCall(ISGetIndices(colperm, &ip));
1997: factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
1998: PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
1999: PetscCall(ISRestoreIndices(colperm, &ip));
2000: PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2001: }
2003: /* Create sptrsv handles for L, U and their transpose */
2004: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2005: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2006: #else
2007: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2008: #endif
2009: factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2010: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2011: factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2012: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2013: }
2015: // Copy the value
2016: for (PetscInt i = 0; i < m; i++) {
2017: PetscInt llen = Bi[i + 1] - Bi[i];
2018: PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
2019: const PetscInt *Li = factors->iL_h.data();
2020: const PetscInt *Ui = factors->iU_h.data();
2022: PetscScalar *La = factors->aL_h.data();
2023: PetscScalar *Ua = factors->aU_h.data();
2025: PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2026: La[Li[i] + llen] = 1.0; // unit diagonal entry of L
2028: Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry of U; MatLUFactorNumeric_SeqAIJ() stores its inverse, so invert it back
2029: PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2030: }
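// Example (illustrative): if B's combined storage for row i holds L entries {l0, l1} and U entries
// with the inverted diagonal {1/d, u0, u1}, the split device factors get L row {l0, l1, 1}
// (unit diagonal appended) and U row {d, u0, u1} (actual diagonal first).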
2032: PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2033: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2034: // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2035: factors->transpose_updated = PETSC_FALSE;
2036: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2038: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2039: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2040: }
2042: B->ops->matsolve = NULL;
2043: B->ops->matsolvetranspose = NULL;
2044: PetscFunctionReturn(PETSC_SUCCESS);
2045: }
2047: static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2048: {
2049: Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2050: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2051: PetscInt fill_lev = info->levels;
2053: PetscFunctionBegin;
2054: PetscCall(PetscLogGpuTimeBegin());
2055: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2056: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2058: auto a_d = aijkok->a_dual.view_device();
2059: auto i_d = aijkok->i_dual.view_device();
2060: auto j_d = aijkok->j_dual.view_device();
2062: PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2064: B->assembled = PETSC_TRUE;
2065: B->preallocated = PETSC_TRUE;
2066: B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2067: B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2068: B->ops->matsolve = NULL;
2069: B->ops->matsolvetranspose = NULL;
2071: /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2072: factors->transpose_updated = PETSC_FALSE;
2073: factors->sptrsv_symbolic_completed = PETSC_FALSE;
2074: /* TODO: log flops, but how to know that? */
2075: PetscCall(PetscLogGpuTimeEnd());
2076: PetscFunctionReturn(PETSC_SUCCESS);
2077: }
2079: // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
2080: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2081: {
2082: Mat_SeqAIJKokkos *aijkok;
2083: Mat_SeqAIJ *b;
2084: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2085: PetscInt fill_lev = info->levels;
2086: PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2087: PetscInt n = A->rmap->n;
2089: PetscFunctionBegin;
2090: PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2091: PetscCall(MatSeqAIJKokkosSyncDevice(A));
2093: /* Create a spiluk handle and then do symbolic factorization */
2094: nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2095: factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2097: auto spiluk_handle = factors->kh.get_spiluk_handle();
2099: Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2100: Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2101: Kokkos::realloc(factors->iU_d, n + 1);
2102: Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2104: aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2105: auto i_d = aijkok->i_dual.view_device();
2106: auto j_d = aijkok->j_dual.view_device();
2107: PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2108: /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2110: Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2111: Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2112: Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2113: Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2115: /* TODO: add options to select sptrsv algorithms */
2116: /* Create sptrsv handles for L, U and their transpose */
2117: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2118: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2119: #else
2120: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2121: #endif
2123: factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2124: factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2125: factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2126: factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2128: /* Fill fields of the factor matrix B */
2129: PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2130: b = (Mat_SeqAIJ *)B->data;
2131: b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2132: B->info.fill_ratio_given = info->fill;
2133: B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2135: B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2136: PetscFunctionReturn(PETSC_SUCCESS);
2137: }
2139: static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2140: {
2141: PetscFunctionBegin;
2142: PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2143: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2144: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2145: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2146: PetscFunctionReturn(PETSC_SUCCESS);
2147: }
2149: static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2150: {
2151: PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2153: PetscFunctionBegin;
2154: if (!info->factoronhost) {
2155: PetscCall(ISIdentity(isrow, &row_identity));
2156: PetscCall(ISIdentity(iscol, &col_identity));
2157: }
2159: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2160: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2162: if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2163: PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2164: } else {
2165: PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2166: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2167: }
2168: PetscFunctionReturn(PETSC_SUCCESS);
2169: }
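/* A sketch of how the device ILU0 path above is reached from the options database (illustrative):

     -pc_type ilu -pc_factor_levels 0 -pc_factor_mat_solver_type kokkos

   With zero fill levels, natural (identity) row/col ordering and info->factoronhost false, symbolic
   and numeric ILU run on device via KK's spiluk; otherwise PETSc's host ILU is used and only the
   triangular solves run on device. */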
2171: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2172: {
2173: PetscFunctionBegin;
2174: PetscCall(MatSeqAIJKokkosSyncHost(A));
2175: PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2177: if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2178: Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2179: Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
2180: const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2181: const MatScalar *Ba = b->a;
2182: PetscInt m = B->rmap->n;
2184: if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2185: // Allocate memory and copy the structure
2186: factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2187: factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2188: factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2189: factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m);
2190: factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2191: factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2193: // Build jU_h from the skewed Bj: row i's off-diagonal entries start at Bj[Bi[i]]; prepend the diagonal and shift them right by one
2194: PetscInt *Uj = factors->jU_h.data();
2195: for (PetscInt i = 0; i < m; i++) {
2196: PetscInt ulen = Bi[i + 1] - Bi[i];
2197: Uj[Bi[i]] = i; // diagonal entry
2198: PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2199: }
2201: // Copy iU, jU to device
2202: PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2203: PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2205: // Copy row/col permutation to device
2206: IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
2207: PetscBool row_identity;
2208: PetscCall(ISIdentity(rowperm, &row_identity));
2209: if (!row_identity) {
2210: const PetscInt *ip;
2212: PetscCall(ISGetIndices(rowperm, &ip));
2213: PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2214: PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2215: PetscCall(ISRestoreIndices(rowperm, &ip));
2216: PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2217: }
2219: // Create sptrsv handles for U and U^T
2220: #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2221: auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2222: #else
2223: auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2224: #endif
2225: factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2226: factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2227: }
2228: // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2229: B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky;
2230: B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2232: // Copy the value
2233: PetscScalar *Ua = factors->aU_h.data();
2234: PetscScalar *D = factors->D_h.data();
2235: for (PetscInt i = 0; i < m; i++) {
2236: D[i] = Ba[Bdiag[i]]; // Ba[Bdiag[i]] actually stores the inverse of the diagonal
2237: Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2238: for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2239: }
2240: PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2241: PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2243: factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2244: factors->transpose_updated = PETSC_FALSE;
2245: }
2247: B->ops->matsolve = NULL;
2248: B->ops->matsolvetranspose = NULL;
2249: PetscFunctionReturn(PETSC_SUCCESS);
2250: }
2252: static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2253: {
2254: PetscFunctionBegin;
2255: if (info->solveonhost) {
2256: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2257: PetscCall(MatSetType(B, MATSEQSBAIJ));
2258: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2259: }
2261: PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2263: if (!info->solveonhost) {
2264: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2265: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2266: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2267: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2268: }
2269: PetscFunctionReturn(PETSC_SUCCESS);
2270: }
2272: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2273: {
2274: PetscFunctionBegin;
2275: if (info->solveonhost) {
2276: // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2277: PetscCall(MatSetType(B, MATSEQSBAIJ));
2278: PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2279: }
2281: PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2283: if (!info->solveonhost) {
2284: // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2285: PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2286: PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2287: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2288: }
2289: PetscFunctionReturn(PETSC_SUCCESS);
2290: }
2292: // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
2293: static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2294: {
2295: PetscFunctionBegin;
2296: *type = MATSOLVERKOKKOS;
2297: PetscFunctionReturn(PETSC_SUCCESS);
2298: }
2300: /*MC
2301: MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2302: on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2304: Level: beginner
2306: .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation`
2307: M*/
2308: PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2309: {
2310: PetscInt n = A->rmap->n;
2311: MPI_Comm comm;
2313: PetscFunctionBegin;
2314: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2315: PetscCall(MatCreate(comm, B));
2316: PetscCall(MatSetSizes(*B, n, n, n, n));
2317: PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2318: (*B)->factortype = ftype;
2319: PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2320: PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2321: PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2323: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2324: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2325: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2326: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2327: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2328: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2329: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2330: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2331: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2332: PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2333: PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2334: } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2336: // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc., though we do it on host
2337: (*B)->canuseordering = PETSC_TRUE;
2338: PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2339: PetscFunctionReturn(PETSC_SUCCESS);
2340: }
2342: PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2343: {
2344: PetscFunctionBegin;
2345: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2346: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2347: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2348: PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2349: PetscFunctionReturn(PETSC_SUCCESS);
2350: }
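/* A minimal sketch of selecting this solver for an LU preconditioner (illustrative; assumes ksp is
   an existing KSP whose operator is a MATSEQAIJKOKKOS matrix):

     PC pc;
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCLU));
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERKOKKOS));

   Equivalently, from the options database: -pc_type lu -pc_factor_mat_solver_type kokkos */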
2352: /* Utility to print out a KokkosCsrMatrix for debugging */
2353: PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2354: {
2355: const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2356: const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2357: const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2358: const PetscInt *i = iv.data();
2359: const PetscInt *j = jv.data();
2360: const PetscScalar *a = av.data();
2361: PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2363: PetscFunctionBegin;
2364: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2365: for (PetscInt k = 0; k < m; k++) {
2366: PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2367: for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2368: PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2369: }
2370: PetscFunctionReturn(PETSC_SUCCESS);
2371: }
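/* Sample output (illustrative) for a 2x2 identity matrix:
     2 x 2 SeqAIJKokkos, with 2 nonzeros
     0: 0(1.0),
     1: 1(1.0),
*/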