Actual source code: mpiaijhipsparse.hip.cpp

  1: /* Portions of this code are under:
  2:    Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
  3: */
  4: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  5: #include <../src/mat/impls/aij/seq/seqhipsparse/hipsparsematimpl.h>
  6: #include <../src/mat/impls/aij/mpi/mpihipsparse/mpihipsparsematimpl.h>
  7: #include <thrust/advance.h>
  8: #include <thrust/partition.h>
  9: #include <thrust/sort.h>
 10: #include <thrust/unique.h>
 11: #include <petscsf.h>

 13: struct VecHIPEquals {
 14:   template <typename Tuple>
 15:   __host__ __device__ void operator()(Tuple t)
 16:   {
 17:     thrust::get<1>(t) = thrust::get<0>(t);
 18:   }
 19: };

 21: static PetscErrorCode MatCOOStructDestroy_MPIAIJHIPSPARSE(void **data)
 22: {
 23:   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;

 25:   PetscFunctionBegin;
 26:   PetscCall(PetscSFDestroy(&coo->sf));
 27:   PetscCallHIP(hipFree(coo->Ajmap1));
 28:   PetscCallHIP(hipFree(coo->Aperm1));
 29:   PetscCallHIP(hipFree(coo->Bjmap1));
 30:   PetscCallHIP(hipFree(coo->Bperm1));
 31:   PetscCallHIP(hipFree(coo->Aimap2));
 32:   PetscCallHIP(hipFree(coo->Ajmap2));
 33:   PetscCallHIP(hipFree(coo->Aperm2));
 34:   PetscCallHIP(hipFree(coo->Bimap2));
 35:   PetscCallHIP(hipFree(coo->Bjmap2));
 36:   PetscCallHIP(hipFree(coo->Bperm2));
 37:   PetscCallHIP(hipFree(coo->Cperm1));
 38:   PetscCallHIP(hipFree(coo->sendbuf));
 39:   PetscCallHIP(hipFree(coo->recvbuf));
 40:   PetscCall(PetscFree(coo));
 41:   PetscFunctionReturn(PETSC_SUCCESS);
 42: }

 44: static PetscErrorCode MatSetPreallocationCOO_MPIAIJHIPSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
 45: {
 46:   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
 47:   PetscBool            dev_ij = PETSC_FALSE;
 48:   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
 49:   PetscInt            *i, *j;
 50:   PetscContainer       container_h;
 51:   MatCOOStruct_MPIAIJ *coo_h, *coo_d;

 53:   PetscFunctionBegin;
 54:   PetscCall(PetscFree(mpiaij->garray));
 55:   PetscCall(VecDestroy(&mpiaij->lvec));
 56: #if defined(PETSC_USE_CTABLE)
 57:   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
 58: #else
 59:   PetscCall(PetscFree(mpiaij->colmap));
 60: #endif
 61:   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
 62:   mat->assembled     = PETSC_FALSE;
 63:   mat->was_assembled = PETSC_FALSE;
 64:   PetscCall(PetscGetMemType(coo_i, &mtype));
 65:   if (PetscMemTypeDevice(mtype)) {
 66:     dev_ij = PETSC_TRUE;
 67:     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
 68:     PetscCallHIP(hipMemcpy(i, coo_i, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
 69:     PetscCallHIP(hipMemcpy(j, coo_j, coo_n * sizeof(PetscInt), hipMemcpyDeviceToHost));
 70:   } else {
 71:     i = coo_i;
 72:     j = coo_j;
 73:   }

 75:   PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
 76:   if (dev_ij) PetscCall(PetscFree2(i, j));
 77:   mat->offloadmask = PETSC_OFFLOAD_CPU;
 78:   // Create the GPU memory
 79:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->A));
 80:   PetscCall(MatSeqAIJHIPSPARSECopyToGPU(mpiaij->B));

 82:   // Copy the COO struct to device
 83:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
 84:   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
 85:   PetscCall(PetscMalloc1(1, &coo_d));
 86:   *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

 88:   PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
 89:   PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
 90:   PetscCallHIP(hipMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
 91:   PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
 92:   PetscCallHIP(hipMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
 93:   PetscCallHIP(hipMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
 94:   PetscCallHIP(hipMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
 95:   PetscCallHIP(hipMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
 96:   PetscCallHIP(hipMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
 97:   PetscCallHIP(hipMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
 98:   PetscCallHIP(hipMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
 99:   PetscCallHIP(hipMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
100:   PetscCallHIP(hipMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
101:   PetscCallHIP(hipMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

103:   PetscCallHIP(hipMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
104:   PetscCallHIP(hipMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
105:   PetscCallHIP(hipMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
106:   PetscCallHIP(hipMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), hipMemcpyHostToDevice));
107:   PetscCallHIP(hipMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
108:   PetscCallHIP(hipMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
109:   PetscCallHIP(hipMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
110:   PetscCallHIP(hipMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), hipMemcpyHostToDevice));
111:   PetscCallHIP(hipMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), hipMemcpyHostToDevice));
112:   PetscCallHIP(hipMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), hipMemcpyHostToDevice));
113:   PetscCallHIP(hipMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), hipMemcpyHostToDevice));

115:   // Put the COO struct in a container and then attach that to the matrix
116:   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJHIPSPARSE));
117:   PetscFunctionReturn(PETSC_SUCCESS);
118: }

120: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
121: {
122:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
123:   const PetscCount grid_size = gridDim.x * blockDim.x;
124:   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
125: }

127: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
128: {
129:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
130:   const PetscCount grid_size = gridDim.x * blockDim.x;
131:   for (; i < Annz + Bnnz; i += grid_size) {
132:     PetscScalar sum = 0.0;
133:     if (i < Annz) {
134:       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
135:       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
136:     } else {
137:       i -= Annz;
138:       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
139:       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
140:     }
141:   }
142: }

144: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
145: {
146:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
147:   const PetscCount grid_size = gridDim.x * blockDim.x;
148:   for (; i < Annz2 + Bnnz2; i += grid_size) {
149:     if (i < Annz2) {
150:       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
151:     } else {
152:       i -= Annz2;
153:       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
154:     }
155:   }
156: }

158: static PetscErrorCode MatSetValuesCOO_MPIAIJHIPSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
159: {
160:   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
161:   Mat                  A = mpiaij->A, B = mpiaij->B;
162:   PetscScalar         *Aa, *Ba;
163:   const PetscScalar   *v1 = v;
164:   PetscMemType         memtype;
165:   PetscContainer       container;
166:   MatCOOStruct_MPIAIJ *coo;

168:   PetscFunctionBegin;
169:   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
170:   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
171:   PetscCall(PetscContainerGetPointer(container, (void **)&coo));

173:   const auto &Annz   = coo->Annz;
174:   const auto &Annz2  = coo->Annz2;
175:   const auto &Bnnz   = coo->Bnnz;
176:   const auto &Bnnz2  = coo->Bnnz2;
177:   const auto &vsend  = coo->sendbuf;
178:   const auto &v2     = coo->recvbuf;
179:   const auto &Ajmap1 = coo->Ajmap1;
180:   const auto &Ajmap2 = coo->Ajmap2;
181:   const auto &Aimap2 = coo->Aimap2;
182:   const auto &Bjmap1 = coo->Bjmap1;
183:   const auto &Bjmap2 = coo->Bjmap2;
184:   const auto &Bimap2 = coo->Bimap2;
185:   const auto &Aperm1 = coo->Aperm1;
186:   const auto &Aperm2 = coo->Aperm2;
187:   const auto &Bperm1 = coo->Bperm1;
188:   const auto &Bperm2 = coo->Bperm2;
189:   const auto &Cperm1 = coo->Cperm1;

191:   PetscCall(PetscGetMemType(v, &memtype));
192:   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
193:     PetscCallHIP(hipMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
194:     PetscCallHIP(hipMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), hipMemcpyHostToDevice));
195:   }

197:   if (imode == INSERT_VALUES) {
198:     PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
199:     PetscCall(MatSeqAIJHIPSPARSEGetArrayWrite(B, &Ba));
200:   } else {
201:     PetscCall(MatSeqAIJHIPSPARSEGetArray(A, &Aa)); /* read & write matrix values */
202:     PetscCall(MatSeqAIJHIPSPARSEGetArray(B, &Ba));
203:   }

205:   PetscCall(PetscLogGpuTimeBegin());
206:   /* Pack entries to be sent to remote */
207:   if (coo->sendlen) {
208:     hipLaunchKernelGGL(HIP_KERNEL_NAME(MatPackCOOValues), dim3((coo->sendlen + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, coo->sendlen, Cperm1, vsend);
209:     PetscCallHIP(hipPeekAtLastError());
210:   }

212:   /* Send remote entries to their owner and overlap the communication with local computation */
213:   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HIP, vsend, PETSC_MEMTYPE_HIP, v2, MPI_REPLACE));
214:   /* Add local entries to A and B */
215:   if (Annz + Bnnz > 0) {
216:     hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddLocalCOOValues), dim3((Annz + Bnnz + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
217:     PetscCallHIP(hipPeekAtLastError());
218:   }
219:   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

221:   /* Add received remote entries to A and B */
222:   if (Annz2 + Bnnz2 > 0) {
223:     hipLaunchKernelGGL(HIP_KERNEL_NAME(MatAddRemoteCOOValues), dim3((Annz2 + Bnnz2 + 255) / 256), dim3(256), 0, PetscDefaultHipStream, v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
224:     PetscCallHIP(hipPeekAtLastError());
225:   }
226:   PetscCall(PetscLogGpuTimeEnd());

228:   if (imode == INSERT_VALUES) {
229:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(A, &Aa));
230:     PetscCall(MatSeqAIJHIPSPARSERestoreArrayWrite(B, &Ba));
231:   } else {
232:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(A, &Aa));
233:     PetscCall(MatSeqAIJHIPSPARSERestoreArray(B, &Ba));
234:   }
235:   if (PetscMemTypeHost(memtype)) PetscCallHIP(hipFree((void *)v1));
236:   mat->offloadmask = PETSC_OFFLOAD_GPU;
237:   PetscFunctionReturn(PETSC_SUCCESS);
238: }

240: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
241: {
242:   Mat             Ad, Ao;
243:   const PetscInt *cmap;

245:   PetscFunctionBegin;
246:   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
247:   PetscCall(MatSeqAIJHIPSPARSEMergeMats(Ad, Ao, scall, A_loc));
248:   if (glob) {
249:     PetscInt cst, i, dn, on, *gidx;

251:     PetscCall(MatGetLocalSize(Ad, NULL, &dn));
252:     PetscCall(MatGetLocalSize(Ao, NULL, &on));
253:     PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
254:     PetscCall(PetscMalloc1(dn + on, &gidx));
255:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
256:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
257:     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
258:   }
259:   PetscFunctionReturn(PETSC_SUCCESS);
260: }

262: static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
263: {
264:   Mat_MPIAIJ          *b               = (Mat_MPIAIJ *)B->data;
265:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)b->spptr;
266:   PetscInt             i;

268:   PetscFunctionBegin;
269:   if (B->hash_active) {
270:     B->ops[0]      = b->cops;
271:     B->hash_active = PETSC_FALSE;
272:   }
273:   PetscCall(PetscLayoutSetUp(B->rmap));
274:   PetscCall(PetscLayoutSetUp(B->cmap));
275:   if (PetscDefined(USE_DEBUG) && d_nnz) {
276:     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
277:   }
278:   if (PetscDefined(USE_DEBUG) && o_nnz) {
279:     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
280:   }
281: #if defined(PETSC_USE_CTABLE)
282:   PetscCall(PetscHMapIDestroy(&b->colmap));
283: #else
284:   PetscCall(PetscFree(b->colmap));
285: #endif
286:   PetscCall(PetscFree(b->garray));
287:   PetscCall(VecDestroy(&b->lvec));
288:   PetscCall(VecScatterDestroy(&b->Mvctx));
289:   /* Because the B will have been resized we simply destroy it and create a new one each time */
290:   PetscCall(MatDestroy(&b->B));
291:   if (!b->A) {
292:     PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
293:     PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
294:   }
295:   if (!b->B) {
296:     PetscMPIInt size;
297:     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
298:     PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
299:     PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
300:   }
301:   PetscCall(MatSetType(b->A, MATSEQAIJHIPSPARSE));
302:   PetscCall(MatSetType(b->B, MATSEQAIJHIPSPARSE));
303:   PetscCall(MatBindToCPU(b->A, B->boundtocpu));
304:   PetscCall(MatBindToCPU(b->B, B->boundtocpu));
305:   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
306:   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
307:   PetscCall(MatHIPSPARSESetFormat(b->A, MAT_HIPSPARSE_MULT, hipsparseStruct->diagGPUMatFormat));
308:   PetscCall(MatHIPSPARSESetFormat(b->B, MAT_HIPSPARSE_MULT, hipsparseStruct->offdiagGPUMatFormat));
309:   B->preallocated  = PETSC_TRUE;
310:   B->was_assembled = PETSC_FALSE;
311:   B->assembled     = PETSC_FALSE;
312:   PetscFunctionReturn(PETSC_SUCCESS);
313: }

315: static PetscErrorCode MatMult_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
316: {
317:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

319:   PetscFunctionBegin;
320:   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
321:   PetscCall((*a->A->ops->mult)(a->A, xx, yy));
322:   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
323:   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
324:   PetscFunctionReturn(PETSC_SUCCESS);
325: }

327: static PetscErrorCode MatZeroEntries_MPIAIJHIPSPARSE(Mat A)
328: {
329:   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

331:   PetscFunctionBegin;
332:   PetscCall(MatZeroEntries(l->A));
333:   PetscCall(MatZeroEntries(l->B));
334:   PetscFunctionReturn(PETSC_SUCCESS);
335: }

337: static PetscErrorCode MatMultAdd_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
338: {
339:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

341:   PetscFunctionBegin;
342:   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
343:   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
344:   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
345:   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
346:   PetscFunctionReturn(PETSC_SUCCESS);
347: }

349: static PetscErrorCode MatMultTranspose_MPIAIJHIPSPARSE(Mat A, Vec xx, Vec yy)
350: {
351:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

353:   PetscFunctionBegin;
354:   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
355:   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
356:   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
357:   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
358:   PetscFunctionReturn(PETSC_SUCCESS);
359: }

361: static PetscErrorCode MatHIPSPARSESetFormat_MPIAIJHIPSPARSE(Mat A, MatHIPSPARSEFormatOperation op, MatHIPSPARSEStorageFormat format)
362: {
363:   Mat_MPIAIJ          *a               = (Mat_MPIAIJ *)A->data;
364:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

366:   PetscFunctionBegin;
367:   switch (op) {
368:   case MAT_HIPSPARSE_MULT_DIAG:
369:     hipsparseStruct->diagGPUMatFormat = format;
370:     break;
371:   case MAT_HIPSPARSE_MULT_OFFDIAG:
372:     hipsparseStruct->offdiagGPUMatFormat = format;
373:     break;
374:   case MAT_HIPSPARSE_ALL:
375:     hipsparseStruct->diagGPUMatFormat    = format;
376:     hipsparseStruct->offdiagGPUMatFormat = format;
377:     break;
378:   default:
379:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatHIPSPARSEFormatOperation. Only MAT_HIPSPARSE_MULT_DIAG, MAT_HIPSPARSE_MULT_DIAG, and MAT_HIPSPARSE_MULT_ALL are currently supported.", op);
380:   }
381:   PetscFunctionReturn(PETSC_SUCCESS);
382: }

384: static PetscErrorCode MatSetFromOptions_MPIAIJHIPSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
385: {
386:   MatHIPSPARSEStorageFormat format;
387:   PetscBool                 flg;
388:   Mat_MPIAIJ               *a               = (Mat_MPIAIJ *)A->data;
389:   Mat_MPIAIJHIPSPARSE      *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)a->spptr;

391:   PetscFunctionBegin;
392:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJHIPSPARSE options");
393:   if (A->factortype == MAT_FACTOR_NONE) {
394:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
395:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_DIAG, format));
396:     PetscCall(PetscOptionsEnum("-mat_hipsparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
397:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_MULT_OFFDIAG, format));
398:     PetscCall(PetscOptionsEnum("-mat_hipsparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijhipsparse gpu matrices for SpMV", "MatHIPSPARSESetFormat", MatHIPSPARSEStorageFormats, (PetscEnum)hipsparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
399:     if (flg) PetscCall(MatHIPSPARSESetFormat(A, MAT_HIPSPARSE_ALL, format));
400:   }
401:   PetscOptionsHeadEnd();
402:   PetscFunctionReturn(PETSC_SUCCESS);
403: }

405: static PetscErrorCode MatAssemblyEnd_MPIAIJHIPSPARSE(Mat A, MatAssemblyType mode)
406: {
407:   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

409:   PetscFunctionBegin;
410:   PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
411:   if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQHIP));
412:   PetscFunctionReturn(PETSC_SUCCESS);
413: }

415: static PetscErrorCode MatDestroy_MPIAIJHIPSPARSE(Mat A)
416: {
417:   Mat_MPIAIJ          *aij             = (Mat_MPIAIJ *)A->data;
418:   Mat_MPIAIJHIPSPARSE *hipsparseStruct = (Mat_MPIAIJHIPSPARSE *)aij->spptr;

420:   PetscFunctionBegin;
421:   PetscCheck(hipsparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
422:   PetscCallCXX(delete hipsparseStruct);
423:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
424:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
425:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
426:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
427:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", NULL));
428:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", NULL));
429:   PetscCall(MatDestroy_MPIAIJ(A));
430:   PetscFunctionReturn(PETSC_SUCCESS);
431: }

433: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
434: {
435:   Mat_MPIAIJ *a;
436:   Mat         A;

438:   PetscFunctionBegin;
439:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
440:   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
441:   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
442:   A             = *newmat;
443:   A->boundtocpu = PETSC_FALSE;
444:   PetscCall(PetscFree(A->defaultvectype));
445:   PetscCall(PetscStrallocpy(VECHIP, &A->defaultvectype));

447:   a = (Mat_MPIAIJ *)A->data;
448:   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJHIPSPARSE));
449:   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJHIPSPARSE));
450:   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQHIP));

452:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJHIPSPARSE);

454:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJHIPSPARSE;
455:   A->ops->mult                  = MatMult_MPIAIJHIPSPARSE;
456:   A->ops->multadd               = MatMultAdd_MPIAIJHIPSPARSE;
457:   A->ops->multtranspose         = MatMultTranspose_MPIAIJHIPSPARSE;
458:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJHIPSPARSE;
459:   A->ops->destroy               = MatDestroy_MPIAIJHIPSPARSE;
460:   A->ops->zeroentries           = MatZeroEntries_MPIAIJHIPSPARSE;
461:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

463:   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJHIPSPARSE));
464:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJHIPSPARSE));
465:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJHIPSPARSE));
466:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatHIPSPARSESetFormat_C", MatHIPSPARSESetFormat_MPIAIJHIPSPARSE));
467:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJHIPSPARSE));
468:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJHIPSPARSE));
469: #if defined(PETSC_HAVE_HYPRE)
470:   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijhipsparse_hypre_C", MatConvert_AIJ_HYPRE));
471: #endif
472:   PetscFunctionReturn(PETSC_SUCCESS);
473: }

475: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJHIPSPARSE(Mat A)
476: {
477:   PetscFunctionBegin;
478:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
479:   PetscCall(MatCreate_MPIAIJ(A));
480:   PetscCall(MatConvert_MPIAIJ_MPIAIJHIPSPARSE(A, MATMPIAIJHIPSPARSE, MAT_INPLACE_MATRIX, &A));
481:   PetscFunctionReturn(PETSC_SUCCESS);
482: }

484: /*@
485:   MatCreateAIJHIPSPARSE - Creates a sparse matrix in AIJ (compressed row) format
486:   (the default parallel PETSc format).

488:   Collective

490:   Input Parameters:
491: + comm  - MPI communicator, set to `PETSC_COMM_SELF`
492: . m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
493:            This value should be the same as the local size used in creating the
494:            y vector for the matrix-vector product y = Ax.
495: . n     - This value should be the same as the local size used in creating the
496:        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
497:        calculated if `N` is given) For square matrices `n` is almost always `m`.
498: . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
499: . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
500: . d_nz  - number of nonzeros per row (same for all rows), for the "diagonal" portion of the matrix
501: . d_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "diagonal" portion of the matrix
502: . o_nz  - number of nonzeros per row (same for all rows), for the "off-diagonal" portion of the matrix
503: - o_nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`, for the "off-diagonal" portion of the matrix

505:   Output Parameter:
506: . A - the matrix

508:   Level: intermediate

510:   Notes:
511:   This matrix will ultimately pushed down to AMD GPUs and use the HIPSPARSE library for
512:   calculations. For good matrix assembly performance the user should preallocate the matrix
513:   storage by setting the parameter `nz` (or the array `nnz`).

515:   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
516:   MatXXXXSetPreallocation() paradigm instead of this routine directly.
517:   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

519:   If `d_nnz` (`o_nnz`) is given then `d_nz` (`o_nz`) is ignored

521:   The `MATAIJ` format (compressed row storage), is fully compatible with standard Fortran
522:   storage.  That is, the stored row and column indices can begin at
523:   either one (as in Fortran) or zero.

525:   Specify the preallocated storage with either `d_nz` (`o_nz`) or `d_nnz` (`o_nnz`) (not both).
526:   Set `d_nz` (`o_nz`) = `PETSC_DEFAULT` and `d_nnz` (`o_nnz`) = `NULL` for PETSc to control dynamic memory
527:   allocation.

529: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJHIPSPARSE`, `MATAIJHIPSPARSE`
530: @*/
531: PetscErrorCode MatCreateAIJHIPSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
532: {
533:   PetscMPIInt size;

535:   PetscFunctionBegin;
536:   PetscCall(MatCreate(comm, A));
537:   PetscCall(MatSetSizes(*A, m, n, M, N));
538:   PetscCallMPI(MPI_Comm_size(comm, &size));
539:   if (size > 1) {
540:     PetscCall(MatSetType(*A, MATMPIAIJHIPSPARSE));
541:     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
542:   } else {
543:     PetscCall(MatSetType(*A, MATSEQAIJHIPSPARSE));
544:     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
545:   }
546:   PetscFunctionReturn(PETSC_SUCCESS);
547: }

549: /*MC
550:    MATAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJHIPSPARSE`.

552:    A matrix type whose data resides on GPUs. These matrices can be in either
553:    CSR, ELL, or Hybrid format. All matrix calculations are performed on AMD GPUs using the HIPSPARSE library.

555:    This matrix type is identical to `MATSEQAIJHIPSPARSE` when constructed with a single process communicator,
556:    and `MATMPIAIJHIPSPARSE` otherwise.  As a result, for single process communicators,
557:    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
558:    for communicators controlling multiple processes.  It is recommended that you call both of
559:    the above preallocation routines for simplicity.

561:    Options Database Keys:
562: +  -mat_type mpiaijhipsparse - sets the matrix type to `MATMPIAIJHIPSPARSE`
563: .  -mat_hipsparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
564: .  -mat_hipsparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
565: -  -mat_hipsparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

567:   Level: beginner

569: .seealso: [](ch_matrices), `Mat`, `MatCreateAIJHIPSPARSE()`, `MATSEQAIJHIPSPARSE`, `MATMPIAIJHIPSPARSE`, `MatCreateSeqAIJHIPSPARSE()`, `MatHIPSPARSESetFormat()`, `MatHIPSPARSEStorageFormat`, `MatHIPSPARSEFormatOperation`
570: M*/

572: /*MC
573:    MATMPIAIJHIPSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJHIPSPARSE`.

575:   Level: beginner

577: .seealso: [](ch_matrices), `Mat`, `MATAIJHIPSPARSE`, `MATSEQAIJHIPSPARSE`
578: M*/