Actual source code: mpiaijcusparse.cu

  1: #define PETSC_SKIP_SPINLOCK
  2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  4: #include <petscconf.h>
  5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
  8: #include <thrust/advance.h>
  9: #include <thrust/partition.h>
 10: #include <thrust/sort.h>
 11: #include <thrust/unique.h>
 12: #include <petscsf.h>

 14: struct VecCUDAEquals {
 15:   template <typename Tuple>
 16:   __host__ __device__ void operator()(Tuple t)
 17:   {
 18:     thrust::get<1>(t) = thrust::get<0>(t);
 19:   }
 20: };

 22: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
 23: {
 24:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
 25:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

 27:   if (!cusparseStruct) return 0;
 28:   if (cusparseStruct->use_extended_coo) {
 29:     cudaFree(cusparseStruct->Ajmap1_d);
 30:     cudaFree(cusparseStruct->Aperm1_d);
 31:     cudaFree(cusparseStruct->Bjmap1_d);
 32:     cudaFree(cusparseStruct->Bperm1_d);
 33:     cudaFree(cusparseStruct->Aimap2_d);
 34:     cudaFree(cusparseStruct->Ajmap2_d);
 35:     cudaFree(cusparseStruct->Aperm2_d);
 36:     cudaFree(cusparseStruct->Bimap2_d);
 37:     cudaFree(cusparseStruct->Bjmap2_d);
 38:     cudaFree(cusparseStruct->Bperm2_d);
 39:     cudaFree(cusparseStruct->Cperm1_d);
 40:     cudaFree(cusparseStruct->sendbuf_d);
 41:     cudaFree(cusparseStruct->recvbuf_d);
 42:   }
 43:   cusparseStruct->use_extended_coo = PETSC_FALSE;
 44:   delete cusparseStruct->coo_p;
 45:   delete cusparseStruct->coo_pw;
 46:   cusparseStruct->coo_p  = NULL;
 47:   cusparseStruct->coo_pw = NULL;
 48:   return 0;
 49: }

 51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
 52: {
 53:   Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
 54:   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
 55:   PetscInt            n    = cusp->coo_nd + cusp->coo_no;

 57:   if (cusp->coo_p && v) {
 58:     thrust::device_ptr<const PetscScalar> d_v;
 59:     THRUSTARRAY                          *w = NULL;

 61:     if (isCudaMem(v)) {
 62:       d_v = thrust::device_pointer_cast(v);
 63:     } else {
 64:       w = new THRUSTARRAY(n);
 65:       w->assign(v, v + n);
 66:       PetscLogCpuToGpu(n * sizeof(PetscScalar));
 67:       d_v = w->data();
 68:     }

 70:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
 71:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
 72:     PetscLogGpuTimeBegin();
 73:     thrust::for_each(zibit, zieit, VecCUDAEquals());
 74:     PetscLogGpuTimeEnd();
 75:     delete w;
 76:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode);
 77:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode);
 78:   } else {
 79:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode);
 80:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode);
 81:   }
 82:   return 0;
 83: }

 85: template <typename Tuple>
 86: struct IsNotOffDiagT {
 87:   PetscInt _cstart, _cend;

 89:   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 90:   __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
 91: };

 93: struct IsOffDiag {
 94:   PetscInt _cstart, _cend;

 96:   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 97:   __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
 98: };

100: struct GlobToLoc {
101:   PetscInt _start;

103:   GlobToLoc(PetscInt start) : _start(start) { }
104:   __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
105: };

107: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
108: {
109:   Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
110:   Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
111:   PetscInt               N, *jj;
112:   size_t                 noff = 0;
113:   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
114:   THRUSTINTARRAY         d_j(n);
115:   ISLocalToGlobalMapping l2g;

117:   MatDestroy(&b->A);
118:   MatDestroy(&b->B);

120:   PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
121:   d_i.assign(coo_i, coo_i + n);
122:   d_j.assign(coo_j, coo_j + n);
123:   PetscLogGpuTimeBegin();
124:   auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
125:   auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
126:   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
127:     cusp->coo_p  = new THRUSTINTARRAY(n);
128:     cusp->coo_pw = new THRUSTARRAY(n);
129:     thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
130:     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
131:     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
132:     auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
133:     firstoffd  = mzipp.get_iterator_tuple().get<1>();
134:   }
135:   cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
136:   cusp->coo_no = thrust::distance(firstoffd, d_j.end());

138:   /* from global to local */
139:   thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
140:   thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
141:   PetscLogGpuTimeEnd();

143:   /* copy offdiag column indices to map on the CPU */
144:   PetscMalloc1(cusp->coo_no, &jj); /* jj[] will store compacted col ids of the offdiag part */
145:   cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost);
146:   auto o_j = d_j.begin();
147:   PetscLogGpuTimeBegin();
148:   thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
149:   thrust::sort(thrust::device, o_j, d_j.end());
150:   auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
151:   PetscLogGpuTimeEnd();
152:   noff = thrust::distance(o_j, wit);
153:   /* allocate the garray, the columns of B will not be mapped in the multiply setup */
154:   PetscMalloc1(noff, &b->garray);
155:   cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost);
156:   PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt));
157:   ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g);
158:   ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH);
159:   ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj);
161:   ISLocalToGlobalMappingDestroy(&l2g);

163:   MatCreate(PETSC_COMM_SELF, &b->A);
164:   MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
165:   MatSetType(b->A, MATSEQAIJCUSPARSE);
166:   MatCreate(PETSC_COMM_SELF, &b->B);
167:   MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff);
168:   MatSetType(b->B, MATSEQAIJCUSPARSE);

170:   /* GPU memory, cusparse specific call handles it internally */
171:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get());
172:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj);
173:   PetscFree(jj);

175:   MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat);
176:   MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat);

178:   MatBindToCPU(b->A, B->boundtocpu);
179:   MatBindToCPU(b->B, B->boundtocpu);
180:   MatSetUpMultiply_MPIAIJ(B);
181:   return 0;
182: }

184: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
185: {
186:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
187:   Mat_MPIAIJCUSPARSE *mpidev;
188:   PetscBool           coo_basic = PETSC_TRUE;
189:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
190:   PetscInt            rstart, rend;

192:   PetscFree(mpiaij->garray);
193:   VecDestroy(&mpiaij->lvec);
194: #if defined(PETSC_USE_CTABLE)
195:   PetscTableDestroy(&mpiaij->colmap);
196: #else
197:   PetscFree(mpiaij->colmap);
198: #endif
199:   VecScatterDestroy(&mpiaij->Mvctx);
200:   mat->assembled     = PETSC_FALSE;
201:   mat->was_assembled = PETSC_FALSE;
202:   MatResetPreallocationCOO_MPIAIJ(mat);
203:   MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);
204:   if (coo_i) {
205:     PetscLayoutGetRange(mat->rmap, &rstart, &rend);
206:     PetscGetMemType(coo_i, &mtype);
207:     if (PetscMemTypeHost(mtype)) {
208:       for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
209:         if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
210:           coo_basic = PETSC_FALSE;
211:           break;
212:         }
213:       }
214:     }
215:   }
216:   /* All ranks must agree on the value of coo_basic */
217:   MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat));
218:   if (coo_basic) {
219:     MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j);
220:   } else {
221:     MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
222:     mat->offloadmask = PETSC_OFFLOAD_CPU;
223:     /* creates the GPU memory */
224:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);
225:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);
226:     mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
227:     mpidev->use_extended_coo = PETSC_TRUE;

229:     cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount));
230:     cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount));

232:     cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount));
233:     cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount));

235:     cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount));
236:     cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount));
237:     cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount));

239:     cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount));
240:     cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount));
241:     cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount));

243:     cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount));
244:     cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar));
245:     cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar));

247:     cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
248:     cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);

250:     cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
251:     cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);

253:     cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
254:     cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
255:     cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);

257:     cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
258:     cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
259:     cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);

261:     cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice);
262:   }
263:   return 0;
264: }

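/* Gather kernel used by MatSetValuesCOO below: with a grid-stride loop it packs the COO
   values destined for other MPI ranks into a contiguous send buffer, buf[i] = kv[perm[i]]. */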
266: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
267: {
268:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
269:   const PetscCount grid_size = gridDim.x * blockDim.x;
270:   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
271: }

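/* For the i-th nonzero of the diagonal block A (i < Annz) or of the off-diagonal block B
   (i >= Annz), sum the locally provided COO values mapped to it: the contributing entries
   are kv[Aperm1[k]] (resp. kv[Bperm1[k]]) for k in [Ajmap1[i], Ajmap1[i+1]).  With
   INSERT_VALUES the previous matrix value is discarded; with ADD_VALUES it is kept. */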
273: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
274: {
275:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
276:   const PetscCount grid_size = gridDim.x * blockDim.x;
277:   for (; i < Annz + Bnnz; i += grid_size) {
278:     PetscScalar sum = 0.0;
279:     if (i < Annz) {
280:       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
281:       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
282:     } else {
283:       i -= Annz;
284:       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
285:       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
286:     }
287:   }
288: }

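/* Add COO values received from other MPI ranks: the i-th remote contribution targets
   nonzero Aimap2[i] of A (or Bimap2[i] of B), and its entries are kv[Aperm2[k]] for
   k in [Ajmap2[i], Ajmap2[i+1]).  Remote values are always accumulated, since any
   INSERT_VALUES zeroing already happened in MatAddLocalCOOValues(). */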
290: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
291: {
292:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
293:   const PetscCount grid_size = gridDim.x * blockDim.x;
294:   for (; i < Annz2 + Bnnz2; i += grid_size) {
295:     if (i < Annz2) {
296:       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
297:     } else {
298:       i -= Annz2;
299:       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
300:     }
301:   }
302: }

304: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
305: {
306:   Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
307:   Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
308:   Mat                 A = mpiaij->A, B = mpiaij->B;
309:   PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
310:   PetscScalar        *Aa, *Ba = NULL;
311:   PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
312:   const PetscScalar  *v1     = v;
313:   const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
314:   const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
315:   const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
316:   const PetscCount   *Cperm1 = mpidev->Cperm1_d;
317:   PetscMemType        memtype;

319:   if (mpidev->use_extended_coo) {
320:     PetscMPIInt size;

322:     MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
323:     PetscGetMemType(v, &memtype);
324:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
325:       cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar));
326:       cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
327:     }

329:     if (imode == INSERT_VALUES) {
330:       MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa); /* write matrix values */
331:       MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba);
332:     } else {
333:       MatSeqAIJCUSPARSEGetArray(A, &Aa); /* read & write matrix values */
334:       MatSeqAIJCUSPARSEGetArray(B, &Ba);
335:     }

337:     /* Pack entries to be sent to remote */
338:     if (mpiaij->sendlen) {
339:       MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
340:       cudaPeekAtLastError();
341:     }

343:     /* Send remote entries to their owner and overlap the communication with local computation */
344:     PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE);
345:     /* Add local entries to A and B */
346:     if (Annz + Bnnz > 0) {
347:       MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
348:       cudaPeekAtLastError();
349:     }
350:     PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE);

352:     /* Add received remote entries to A and B */
353:     if (Annz2 + Bnnz2 > 0) {
354:       MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
355:       cudaPeekAtLastError();
356:     }

358:     if (imode == INSERT_VALUES) {
359:       MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa);
360:       MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba);
361:     } else {
362:       MatSeqAIJCUSPARSERestoreArray(A, &Aa);
363:       MatSeqAIJCUSPARSERestoreArray(B, &Ba);
364:     }
365:     if (PetscMemTypeHost(memtype)) cudaFree((void *)v1);
366:   } else {
367:     MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode);
368:   }
369:   mat->offloadmask = PETSC_OFFLOAD_GPU;
370:   return 0;
371: }

373: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
374: {
375:   Mat             Ad, Ao;
376:   const PetscInt *cmap;

378:   MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap);
379:   MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc);
380:   if (glob) {
381:     PetscInt cst, i, dn, on, *gidx;

383:     MatGetLocalSize(Ad, NULL, &dn);
384:     MatGetLocalSize(Ao, NULL, &on);
385:     MatGetOwnershipRangeColumn(A, &cst, NULL);
386:     PetscMalloc1(dn + on, &gidx);
387:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
388:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
389:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
390:   }
391:   return 0;
392: }

394: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
395: {
396:   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
397:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
398:   PetscInt            i;

400:   PetscLayoutSetUp(B->rmap);
401:   PetscLayoutSetUp(B->cmap);
402:   if (PetscDefined(USE_DEBUG) && d_nnz) {
403:     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
404:   }
405:   if (PetscDefined(USE_DEBUG) && o_nnz) {
406:     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
407:   }
408: #if defined(PETSC_USE_CTABLE)
409:   PetscTableDestroy(&b->colmap);
410: #else
411:   PetscFree(b->colmap);
412: #endif
413:   PetscFree(b->garray);
414:   VecDestroy(&b->lvec);
415:   VecScatterDestroy(&b->Mvctx);
416:   /* Because B may have been resized we simply destroy it and create a new one each time */
417:   MatDestroy(&b->B);
418:   if (!b->A) {
419:     MatCreate(PETSC_COMM_SELF, &b->A);
420:     MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
421:   }
422:   if (!b->B) {
423:     PetscMPIInt size;
424:     MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
425:     MatCreate(PETSC_COMM_SELF, &b->B);
426:     MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0);
427:   }
428:   MatSetType(b->A, MATSEQAIJCUSPARSE);
429:   MatSetType(b->B, MATSEQAIJCUSPARSE);
430:   MatBindToCPU(b->A, B->boundtocpu);
431:   MatBindToCPU(b->B, B->boundtocpu);
432:   MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz);
433:   MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz);
434:   MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat);
435:   MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat);
436:   B->preallocated = PETSC_TRUE;
437:   return 0;
438: }

440: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
441: {
442:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

444:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
445:   (*a->A->ops->mult)(a->A, xx, yy);
446:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
447:   (*a->B->ops->multadd)(a->B, a->lvec, yy, yy);
448:   return 0;
449: }

451: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
452: {
453:   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

455:   MatZeroEntries(l->A);
456:   MatZeroEntries(l->B);
457:   return 0;
458: }

460: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

464:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
465:   (*a->A->ops->multadd)(a->A, xx, yy, zz);
466:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
467:   (*a->B->ops->multadd)(a->B, a->lvec, zz, zz);
468:   return 0;
469: }

471: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
472: {
473:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

475:   (*a->B->ops->multtranspose)(a->B, xx, a->lvec);
476:   (*a->A->ops->multtranspose)(a->A, xx, yy);
477:   VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
478:   VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
479:   return 0;
480: }

482: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
483: {
484:   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
485:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

487:   switch (op) {
488:   case MAT_CUSPARSE_MULT_DIAG:
489:     cusparseStruct->diagGPUMatFormat = format;
490:     break;
491:   case MAT_CUSPARSE_MULT_OFFDIAG:
492:     cusparseStruct->offdiagGPUMatFormat = format;
493:     break;
494:   case MAT_CUSPARSE_ALL:
495:     cusparseStruct->diagGPUMatFormat    = format;
496:     cusparseStruct->offdiagGPUMatFormat = format;
497:     break;
498:   default:
499:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
500:   }
501:   return 0;
502: }

504: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
505: {
506:   MatCUSPARSEStorageFormat format;
507:   PetscBool                flg;
508:   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
509:   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

511:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
512:   if (A->factortype == MAT_FACTOR_NONE) {
513:     PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
514:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format);
515:     PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg);
516:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format);
517:     PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
518:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format);
519:   }
520:   PetscOptionsHeadEnd();
521:   return 0;
522: }

524: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
525: {
526:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
527:   Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
528:   PetscObjectState    onnz   = A->nonzerostate;

530:   MatAssemblyEnd_MPIAIJ(A, mode);
531:   if (mpiaij->lvec) VecSetType(mpiaij->lvec, VECSEQCUDA);
532:   if (onnz != A->nonzerostate && cusp->deviceMat) {
533:     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

535:     PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
536:     PetscNew(&h_mat);
537:     cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
538:     cudaFree(h_mat->colmap);
539:     if (h_mat->allocated_indices) {
540:       cudaFree(h_mat->diag.i);
541:       cudaFree(h_mat->diag.j);
542:       if (h_mat->offdiag.j) {
543:         cudaFree(h_mat->offdiag.i);
544:         cudaFree(h_mat->offdiag.j);
545:       }
546:     }
547:     cudaFree(d_mat);
548:     PetscFree(h_mat);
549:     cusp->deviceMat = NULL;
550:   }
551:   return 0;
552: }

554: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
555: {
556:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
557:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

560:   if (cusparseStruct->deviceMat) {
561:     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

563:     PetscInfo(A, "Have device matrix\n");
564:     PetscNew(&h_mat);
565:     cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
566:     cudaFree(h_mat->colmap);
567:     if (h_mat->allocated_indices) {
568:       cudaFree(h_mat->diag.i);
569:       cudaFree(h_mat->diag.j);
570:       if (h_mat->offdiag.j) {
571:         cudaFree(h_mat->offdiag.i);
572:         cudaFree(h_mat->offdiag.j);
573:       }
574:     }
575:     cudaFree(d_mat);
576:     PetscFree(h_mat);
577:   }
578:   /* Free COO */
579:   MatResetPreallocationCOO_MPIAIJCUSPARSE(A);
580:   delete cusparseStruct;
581:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
582:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
583:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
584:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
585:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL);
586:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL);
587:   MatDestroy_MPIAIJ(A);
588:   return 0;
589: }

591: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
592: {
593:   Mat_MPIAIJ *a;
594:   Mat         A;

596:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
597:   if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B, MAT_COPY_VALUES, newmat);
598:   else if (reuse == MAT_REUSE_MATRIX) MatCopy(B, *newmat, SAME_NONZERO_PATTERN);
599:   A             = *newmat;
600:   A->boundtocpu = PETSC_FALSE;
601:   PetscFree(A->defaultvectype);
602:   PetscStrallocpy(VECCUDA, &A->defaultvectype);

604:   a = (Mat_MPIAIJ *)A->data;
605:   if (a->A) MatSetType(a->A, MATSEQAIJCUSPARSE);
606:   if (a->B) MatSetType(a->B, MATSEQAIJCUSPARSE);
607:   if (a->lvec) VecSetType(a->lvec, VECSEQCUDA);

609:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) a->spptr = new Mat_MPIAIJCUSPARSE;

611:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
612:   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
613:   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
614:   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
615:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
616:   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
617:   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
618:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

620:   PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE);
621:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
622:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
623:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);
624:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE);
625:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE);
626: #if defined(PETSC_HAVE_HYPRE)
627:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE);
628: #endif
629:   return 0;
630: }

632: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
633: {
634:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
635:   MatCreate_MPIAIJ(A);
636:   MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A);
637:   return 0;
638: }

640: /*@
641:    MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
642:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
643:    to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
644:    assembly performance the user should preallocate the matrix storage by setting
645:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
646:    performance during matrix assembly can be increased by more than a factor of 50.

648:    Collective

650:    Input Parameters:
651: +  comm - MPI communicator
652: .  m, n - number of local rows and columns (or `PETSC_DECIDE` to have them calculated from M and N)
653: .  M, N - number of global rows and columns (or `PETSC_DETERMINE` to have them calculated from m and n)
654: .  d_nz, o_nz - number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix (same for all local rows)
655: -  d_nnz, o_nnz - arrays containing the number of nonzeros in the various rows of the diagonal and off-diagonal portions of the local submatrix
656:          (possibly different for each row), or NULL

658:    Output Parameter:
659: .  A - the matrix

661:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
662:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
663:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

665:    Notes:
666:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

668:    The AIJ format (also called
669:    compressed row storage) is fully compatible with standard Fortran 77
670:    storage.  That is, the stored row and column indices can begin at
671:    either one (as in Fortran) or zero.  See the users' manual for details.

673:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
674:    Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
675:    allocation.  For large problems you MUST preallocate memory or you
676:    will get TERRIBLE performance, see the users' manual chapter on matrices.

678:    By default, this format uses inodes (identical nodes) when possible, to
679:    improve numerical efficiency of matrix-vector products and solves. We
680:    search for consecutive rows with the same nonzero structure, thereby
681:    reusing matrix information to achieve increased efficiency.

683:    Level: intermediate

685: .seealso: `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
686: @*/
687: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
688: {
689:   PetscMPIInt size;

691:   MatCreate(comm, A);
692:   MatSetSizes(*A, m, n, M, N);
693:   MPI_Comm_size(comm, &size);
694:   if (size > 1) {
695:     MatSetType(*A, MATMPIAIJCUSPARSE);
696:     MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
697:   } else {
698:     MatSetType(*A, MATSEQAIJCUSPARSE);
699:     MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
700:   }
701:   return 0;
702: }
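
/*
   Usage sketch (illustrative only, not part of this file): the MatCreate()/MatSetType()/
   MatXXXXSetPreallocation() paradigm recommended in the manual page above.  The global
   size 100 and the per-row nonzero estimates 5 and 2 are placeholders.

     Mat A;
     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
     MatSetType(A, MATAIJCUSPARSE);
     MatSetFromOptions(A);
     MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); // used when the communicator has more than one process
     MatSeqAIJSetPreallocation(A, 5, NULL);          // used on exactly one process
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), use A ...
     MatDestroy(&A);
*/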

704: /*MC
705:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

707:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
708:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
709:    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

711:    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
712:    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
713:    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
714:    for communicators controlling multiple processes.  It is recommended that you call both of
715:    the above preallocation routines for simplicity.

717:    Options Database Keys:
718: +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
719: .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
720: .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
721: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).

723:   Level: beginner

725:  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
726: M*/
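
/*
   Illustrative command line exercising the options database keys listed above; the
   executable name ./app and the process count are placeholders:

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format ell -mat_cusparse_mult_offdiag_storage_format hyb
*/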

728: /*MC
729:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

731:   Level: beginner

733:  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
734: M*/

736: // Get GPU pointers to a stripped-down Mat, for both Seq and MPI Mat.
737: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
738: {
739:   PetscSplitCSRDataStructure d_mat;
740:   PetscMPIInt                size;
741:   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
742:   PetscScalar               *aa = NULL, *ba = NULL;
743:   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
744:   Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
745:   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

748:   if (A->factortype != MAT_FACTOR_NONE) {
749:     *B = NULL;
750:     return 0;
751:   }
752:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
753:   // get jaca
754:   if (size == 1) {
755:     PetscBool isseqaij;

757:     PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij);
758:     if (isseqaij) {
759:       jaca = (Mat_SeqAIJ *)A->data;
761:       cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
762:       d_mat           = cusparsestructA->deviceMat;
763:       MatSeqAIJCUSPARSECopyToGPU(A);
764:     } else {
765:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
767:       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
768:       jaca                      = (Mat_SeqAIJ *)aij->A->data;
769:       cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
770:       d_mat                     = spptr->deviceMat;
771:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
772:     }
773:     if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
774:       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
776:       matrixA = (CsrMatrix *)matstruct->mat;
777:       bi      = NULL;
778:       bj      = NULL;
779:       ba      = NULL;
780:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
781:   } else {
782:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
784:     jaca                      = (Mat_SeqAIJ *)aij->A->data;
785:     jacb                      = (Mat_SeqAIJ *)aij->B->data;
786:     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

789:     cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
790:     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
792:     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
793:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
794:       MatSeqAIJCUSPARSECopyToGPU(aij->B);
795:       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
796:       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
799:       matrixA = (CsrMatrix *)matstructA->mat;
800:       matrixB = (CsrMatrix *)matstructB->mat;
801:       if (jacb->compressedrow.use) {
802:         if (!cusparsestructB->rowoffsets_gpu) {
803:           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
804:           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
805:         }
806:         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
807:       } else {
808:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
809:       }
810:       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
811:       ba = thrust::raw_pointer_cast(matrixB->values->data());
812:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
813:     d_mat = spptr->deviceMat;
814:   }
815:   if (jaca->compressedrow.use) {
816:     if (!cusparsestructA->rowoffsets_gpu) {
817:       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
818:       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
819:     }
820:     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
821:   } else {
822:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
823:   }
824:   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
825:   aa = thrust::raw_pointer_cast(matrixA->values->data());

827:   if (!d_mat) {
828:     PetscSplitCSRDataStructure h_mat;

830:     // create and populate the struct on the host, then copy it to the device
831:     PetscInfo(A, "Create device matrix\n");
832:     PetscNew(&h_mat);
833:     cudaMalloc((void **)&d_mat, sizeof(*d_mat));
834:     if (size > 1) { /* need the colmap array */
835:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
836:       PetscInt   *colmap;
837:       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;


841:       PetscCalloc1(N + 1, &colmap);
842:       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
843: #if defined(PETSC_USE_64BIT_INDICES)
844:       { // have to make 64-bit (PetscInt) copies of these 32-bit indices
845:         int      *h_bi32, *h_bj32;
846:         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
847:         PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64);
848:         cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost);
849:         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
850:         cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost);
851:         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

853:         cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64));
854:         cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice);
855:         cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64));
856:         cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice);

858:         h_mat->offdiag.i         = d_bi64;
859:         h_mat->offdiag.j         = d_bj64;
860:         h_mat->allocated_indices = PETSC_TRUE;

862:         PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64);
863:       }
864: #else
865:       h_mat->offdiag.i         = (PetscInt *)bi;
866:       h_mat->offdiag.j         = (PetscInt *)bj;
867:       h_mat->allocated_indices = PETSC_FALSE;
868: #endif
869:       h_mat->offdiag.a = ba;
870:       h_mat->offdiag.n = A->rmap->n;

872:       cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap));
873:       cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice);
874:       PetscFree(colmap);
875:     }
876:     h_mat->rstart = A->rmap->rstart;
877:     h_mat->rend   = A->rmap->rend;
878:     h_mat->cstart = A->cmap->rstart;
879:     h_mat->cend   = A->cmap->rend;
880:     h_mat->M      = A->cmap->N;
881: #if defined(PETSC_USE_64BIT_INDICES)
882:     {
883:       int      *h_ai32, *h_aj32;
884:       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

886:       static_assert(sizeof(PetscInt) == 8, "");
887:       PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64);
888:       cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost);
889:       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
890:       cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost);
891:       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

893:       cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64));
894:       cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice);
895:       cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64));
896:       cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice);

898:       h_mat->diag.i            = d_ai64;
899:       h_mat->diag.j            = d_aj64;
900:       h_mat->allocated_indices = PETSC_TRUE;

902:       PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64);
903:     }
904: #else
905:     h_mat->diag.i            = (PetscInt *)ai;
906:     h_mat->diag.j            = (PetscInt *)aj;
907:     h_mat->allocated_indices = PETSC_FALSE;
908: #endif
909:     h_mat->diag.a = aa;
910:     h_mat->diag.n = A->rmap->n;
911:     h_mat->rank   = PetscGlobalRank;
912:     // copy pointers and metadata to device
913:     cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice);
914:     PetscFree(h_mat);
915:   } else {
916:     PetscInfo(A, "Reusing device matrix\n");
917:   }
918:   *B             = d_mat;
919:   A->offloadmask = PETSC_OFFLOAD_GPU;
920:   return 0;
921: }