Actual source code: mpiaijcusparse.cu

  1: #define PETSC_SKIP_SPINLOCK
  2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  4: #include <petscconf.h>
  5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
  6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
  7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
  8: #include <thrust/advance.h>
  9: #include <thrust/partition.h>
 10: #include <thrust/sort.h>
 11: #include <thrust/unique.h>
 12: #include <petscsf.h>

 14: struct VecCUDAEquals {
 15:   template <typename Tuple>
 16:   __host__ __device__ void operator()(Tuple t)
 17:   {
 18:     thrust::get<1>(t) = thrust::get<0>(t);
 19:   }
 20: };

 22: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
 23: {
 24:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
 25:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

 27:   if (!cusparseStruct) return 0;
 28:   if (cusparseStruct->use_extended_coo) {
 29:     cudaFree(cusparseStruct->Ajmap1_d);
 30:     cudaFree(cusparseStruct->Aperm1_d);
 31:     cudaFree(cusparseStruct->Bjmap1_d);
 32:     cudaFree(cusparseStruct->Bperm1_d);
 33:     cudaFree(cusparseStruct->Aimap2_d);
 34:     cudaFree(cusparseStruct->Ajmap2_d);
 35:     cudaFree(cusparseStruct->Aperm2_d);
 36:     cudaFree(cusparseStruct->Bimap2_d);
 37:     cudaFree(cusparseStruct->Bjmap2_d);
 38:     cudaFree(cusparseStruct->Bperm2_d);
 39:     cudaFree(cusparseStruct->Cperm1_d);
 40:     cudaFree(cusparseStruct->sendbuf_d);
 41:     cudaFree(cusparseStruct->recvbuf_d);
 42:   }
 43:   cusparseStruct->use_extended_coo = PETSC_FALSE;
 44:   delete cusparseStruct->coo_p;
 45:   delete cusparseStruct->coo_pw;
 46:   cusparseStruct->coo_p  = NULL;
 47:   cusparseStruct->coo_pw = NULL;
 48:   return 0;
 49: }

 51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
 52: {
 53:   Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
 54:   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
 55:   PetscInt            n    = cusp->coo_nd + cusp->coo_no;

 57:   if (cusp->coo_p && v) {
 58:     thrust::device_ptr<const PetscScalar> d_v;
 59:     THRUSTARRAY                          *w = NULL;

 61:     if (isCudaMem(v)) {
 62:       d_v = thrust::device_pointer_cast(v);
 63:     } else {
 64:       w = new THRUSTARRAY(n);
 65:       w->assign(v, v + n);
 66:       PetscLogCpuToGpu(n * sizeof(PetscScalar));
 67:       d_v = w->data();
 68:     }

 70:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
 71:     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
 72:     PetscLogGpuTimeBegin();
 73:     thrust::for_each(zibit, zieit, VecCUDAEquals());
 74:     PetscLogGpuTimeEnd();
 75:     delete w;
 76:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode);
 77:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode);
 78:   } else {
 79:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode);
 80:     MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode);
 81:   }
 82:   return 0;
 83: }

 85: template <typename Tuple>
 86: struct IsNotOffDiagT {
 87:   PetscInt _cstart, _cend;

 89:   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 90:   __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
 91: };

 93: struct IsOffDiag {
 94:   PetscInt _cstart, _cend;

 96:   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
 97:   __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
 98: };

100: struct GlobToLoc {
101:   PetscInt _start;

103:   GlobToLoc(PetscInt start) : _start(start) { }
104:   __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
105: };

107: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
108: {
109:   Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
110:   Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
111:   PetscInt               N, *jj;
112:   size_t                 noff = 0;
113:   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
114:   THRUSTINTARRAY         d_j(n);
115:   ISLocalToGlobalMapping l2g;

117:   MatDestroy(&b->A);
118:   MatDestroy(&b->B);

120:   PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
121:   d_i.assign(coo_i, coo_i + n);
122:   d_j.assign(coo_j, coo_j + n);
123:   PetscLogGpuTimeBegin();
124:   auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
125:   auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
126:   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
127:     cusp->coo_p  = new THRUSTINTARRAY(n);
128:     cusp->coo_pw = new THRUSTARRAY(n);
129:     thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
130:     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
131:     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
132:     auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
133:     firstoffd  = mzipp.get_iterator_tuple().get<1>();
134:   }
135:   cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
136:   cusp->coo_no = thrust::distance(firstoffd, d_j.end());

138:   /* from global to local */
139:   thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
140:   thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
141:   PetscLogGpuTimeEnd();

143:   /* copy offdiag column indices to map on the CPU */
144:   PetscMalloc1(cusp->coo_no, &jj); /* jj[] will store compacted col ids of the offdiag part */
145:   cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost);
146:   auto o_j = d_j.begin();
147:   PetscLogGpuTimeBegin();
148:   thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
149:   thrust::sort(thrust::device, o_j, d_j.end());
150:   auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
151:   PetscLogGpuTimeEnd();
152:   noff = thrust::distance(o_j, wit);
153:   /* allocate the garray, the columns of B will not be mapped in the multiply setup */
154:   PetscMalloc1(noff, &b->garray);
155:   cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost);
156:   PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt));
157:   ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g);
158:   ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH);
159:   ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj);
161:   ISLocalToGlobalMappingDestroy(&l2g);

163:   MatCreate(PETSC_COMM_SELF, &b->A);
164:   MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
165:   MatSetType(b->A, MATSEQAIJCUSPARSE);
166:   MatCreate(PETSC_COMM_SELF, &b->B);
167:   MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff);
168:   MatSetType(b->B, MATSEQAIJCUSPARSE);

170:   /* GPU memory, cusparse specific call handles it internally */
171:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get());
172:   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj);
173:   PetscFree(jj);

175:   MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat);
176:   MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat);

178:   MatBindToCPU(b->A, B->boundtocpu);
179:   MatBindToCPU(b->B, B->boundtocpu);
180:   MatSetUpMultiply_MPIAIJ(B);
181:   return 0;
182: }

184: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
185: {
186:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
187:   Mat_MPIAIJCUSPARSE *mpidev;
188:   PetscBool           coo_basic = PETSC_TRUE;
189:   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
190:   PetscInt            rstart, rend;

192:   PetscFree(mpiaij->garray);
193:   VecDestroy(&mpiaij->lvec);
194: #if defined(PETSC_USE_CTABLE)
195:   PetscTableDestroy(&mpiaij->colmap);
196: #else
197:   PetscFree(mpiaij->colmap);
198: #endif
199:   VecScatterDestroy(&mpiaij->Mvctx);
200:   mat->assembled     = PETSC_FALSE;
201:   mat->was_assembled = PETSC_FALSE;
202:   MatResetPreallocationCOO_MPIAIJ(mat);
203:   MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);
204:   if (coo_i) {
205:     PetscLayoutGetRange(mat->rmap, &rstart, &rend);
206:     PetscGetMemType(coo_i, &mtype);
207:     if (PetscMemTypeHost(mtype)) {
208:       for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
209:         if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
210:           coo_basic = PETSC_FALSE;
211:           break;
212:         }
213:       }
214:     }
215:   }
216:   /* All ranks must agree on the value of coo_basic */
217:   MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat));
218:   if (coo_basic) {
219:     MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j);
220:   } else {
221:     MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
222:     mat->offloadmask = PETSC_OFFLOAD_CPU;
223:     /* creates the GPU memory */
224:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);
225:     MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);
226:     mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
227:     mpidev->use_extended_coo = PETSC_TRUE;

229:     cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount));
230:     cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount));

232:     cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount));
233:     cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount));

235:     cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount));
236:     cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount));
237:     cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount));

239:     cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount));
240:     cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount));
241:     cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount));

243:     cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount));
244:     cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar));
245:     cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar));

247:     cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
248:     cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);

250:     cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
251:     cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);

253:     cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
254:     cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
255:     cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);

257:     cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
258:     cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
259:     cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);

261:     cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice);
262:   }
263:   return 0;
264: }

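/* Gather kernel used by MatSetValuesCOO below: with a grid-stride loop it packs the COO
   values destined for other MPI ranks into a contiguous send buffer, buf[i] = kv[perm[i]]. */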
266: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
267: {
268:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
269:   const PetscCount grid_size = gridDim.x * blockDim.x;
270:   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
271: }

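/* For the i-th nonzero of the diagonal block A (i < Annz) or of the off-diagonal block B
   (i >= Annz), sum the locally provided COO values mapped to it: the contributing entries
   are kv[Aperm1[k]] (resp. kv[Bperm1[k]]) for k in [Ajmap1[i], Ajmap1[i+1]).  With
   INSERT_VALUES the previous matrix value is discarded; with ADD_VALUES it is kept. */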
273: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
274: {
275:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
276:   const PetscCount grid_size = gridDim.x * blockDim.x;
277:   for (; i < Annz + Bnnz; i += grid_size) {
278:     PetscScalar sum = 0.0;
279:     if (i < Annz) {
280:       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
281:       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
282:     } else {
283:       i -= Annz;
284:       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
285:       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
286:     }
287:   }
288: }

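/* Add COO values received from other MPI ranks: the i-th remote contribution targets
   nonzero Aimap2[i] of A (or Bimap2[i] of B), and its entries are kv[Aperm2[k]] for
   k in [Ajmap2[i], Ajmap2[i+1]).  Remote values are always accumulated, since any
   INSERT_VALUES zeroing already happened in MatAddLocalCOOValues(). */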
290: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
291: {
292:   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
293:   const PetscCount grid_size = gridDim.x * blockDim.x;
294:   for (; i < Annz2 + Bnnz2; i += grid_size) {
295:     if (i < Annz2) {
296:       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
297:     } else {
298:       i -= Annz2;
299:       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
300:     }
301:   }
302: }

304: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
305: {
306:   Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
307:   Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
308:   Mat                 A = mpiaij->A, B = mpiaij->B;
309:   PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
310:   PetscScalar        *Aa, *Ba = NULL;
311:   PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
312:   const PetscScalar  *v1     = v;
313:   const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
314:   const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
315:   const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
316:   const PetscCount   *Cperm1 = mpidev->Cperm1_d;
317:   PetscMemType        memtype;

319:   if (mpidev->use_extended_coo) {
320:     PetscMPIInt size;

322:     MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
323:     PetscGetMemType(v, &memtype);
324:     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
325:       cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar));
326:       cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
327:     }

329:     if (imode == INSERT_VALUES) {
330:       MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa); /* write matrix values */
331:       MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba);
332:     } else {
333:       MatSeqAIJCUSPARSEGetArray(A, &Aa); /* read & write matrix values */
334:       MatSeqAIJCUSPARSEGetArray(B, &Ba);
335:     }

337:     /* Pack entries to be sent to remote */
338:     if (mpiaij->sendlen) {
339:       MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
340:       cudaPeekAtLastError();
341:     }

343:     /* Send remote entries to their owner and overlap the communication with local computation */
344:     PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE);
345:     /* Add local entries to A and B */
346:     if (Annz + Bnnz > 0) {
347:       MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
348:       cudaPeekAtLastError();
349:     }
350:     PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE);

352:     /* Add received remote entries to A and B */
353:     if (Annz2 + Bnnz2 > 0) {
354:       MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
355:       cudaPeekAtLastError();
356:     }

358:     if (imode == INSERT_VALUES) {
359:       MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa);
360:       MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba);
361:     } else {
362:       MatSeqAIJCUSPARSERestoreArray(A, &Aa);
363:       MatSeqAIJCUSPARSERestoreArray(B, &Ba);
364:     }
365:     if (PetscMemTypeHost(memtype)) cudaFree((void *)v1);
366:   } else {
367:     MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode);
368:   }
369:   mat->offloadmask = PETSC_OFFLOAD_GPU;
370:   return 0;
371: }

373: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
374: {
375:   Mat             Ad, Ao;
376:   const PetscInt *cmap;

378:   MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap);
379:   MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc);
380:   if (glob) {
381:     PetscInt cst, i, dn, on, *gidx;

383:     MatGetLocalSize(Ad, NULL, &dn);
384:     MatGetLocalSize(Ao, NULL, &on);
385:     MatGetOwnershipRangeColumn(A, &cst, NULL);
386:     PetscMalloc1(dn + on, &gidx);
387:     for (i = 0; i < dn; i++) gidx[i] = cst + i;
388:     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
389:     ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
390:   }
391:   return 0;
392: }

394: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
395: {
396:   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
397:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
398:   PetscInt            i;

400:   PetscLayoutSetUp(B->rmap);
401:   PetscLayoutSetUp(B->cmap);
402:   if (PetscDefined(USE_DEBUG) && d_nnz) {
403:     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
404:   }
405:   if (PetscDefined(USE_DEBUG) && o_nnz) {
406:     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
407:   }
408: #if defined(PETSC_USE_CTABLE)
409:   PetscTableDestroy(&b->colmap);
410: #else
411:   PetscFree(b->colmap);
412: #endif
413:   PetscFree(b->garray);
414:   VecDestroy(&b->lvec);
415:   VecScatterDestroy(&b->Mvctx);
416:   /* Because B may have been resized we simply destroy it and create a new one each time */
417:   MatDestroy(&b->B);
418:   if (!b->A) {
419:     MatCreate(PETSC_COMM_SELF, &b->A);
420:     MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
421:   }
422:   if (!b->B) {
423:     PetscMPIInt size;
424:     MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
425:     MatCreate(PETSC_COMM_SELF, &b->B);
426:     MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0);
427:   }
428:   MatSetType(b->A, MATSEQAIJCUSPARSE);
429:   MatSetType(b->B, MATSEQAIJCUSPARSE);
430:   MatBindToCPU(b->A, B->boundtocpu);
431:   MatBindToCPU(b->B, B->boundtocpu);
432:   MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz);
433:   MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz);
434:   MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat);
435:   MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat);
436:   B->preallocated = PETSC_TRUE;
437:   return 0;
438: }

440: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
441: {
442:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

444:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
445:   (*a->A->ops->mult)(a->A, xx, yy);
446:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
447:   (*a->B->ops->multadd)(a->B, a->lvec, yy, yy);
448:   return 0;
449: }

451: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
452: {
453:   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

455:   MatZeroEntries(l->A);
456:   MatZeroEntries(l->B);
457:   return 0;
458: }

460: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

464:   VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
465:   (*a->A->ops->multadd)(a->A, xx, yy, zz);
466:   VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
467:   (*a->B->ops->multadd)(a->B, a->lvec, zz, zz);
468:   return 0;
469: }

471: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
472: {
473:   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

475:   (*a->B->ops->multtranspose)(a->B, xx, a->lvec);
476:   (*a->A->ops->multtranspose)(a->A, xx, yy);
477:   VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
478:   VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
479:   return 0;
480: }

482: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
483: {
484:   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
485:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

487:   switch (op) {
488:   case MAT_CUSPARSE_MULT_DIAG:
489:     cusparseStruct->diagGPUMatFormat = format;
490:     break;
491:   case MAT_CUSPARSE_MULT_OFFDIAG:
492:     cusparseStruct->offdiagGPUMatFormat = format;
493:     break;
494:   case MAT_CUSPARSE_ALL:
495:     cusparseStruct->diagGPUMatFormat    = format;
496:     cusparseStruct->offdiagGPUMatFormat = format;
497:     break;
498:   default:
499:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
500:   }
501:   return 0;
502: }

504: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
505: {
506:   MatCUSPARSEStorageFormat format;
507:   PetscBool                flg;
508:   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
509:   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

511:   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
512:   if (A->factortype == MAT_FACTOR_NONE) {
513:     PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
514:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format);
515:     PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg);
516:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format);
517:     PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
518:     if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format);
519:   }
520:   PetscOptionsHeadEnd();
521:   return 0;
522: }

524: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
525: {
526:   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
527:   Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
528:   PetscObjectState    onnz   = A->nonzerostate;

530:   MatAssemblyEnd_MPIAIJ(A, mode);
531:   if (mpiaij->lvec) VecSetType(mpiaij->lvec, VECSEQCUDA);
532:   if (onnz != A->nonzerostate && cusp->deviceMat) {
533:     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

535:     PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
536:     PetscNew(&h_mat);
537:     cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
538:     cudaFree(h_mat->colmap);
539:     if (h_mat->allocated_indices) {
540:       cudaFree(h_mat->diag.i);
541:       cudaFree(h_mat->diag.j);
542:       if (h_mat->offdiag.j) {
543:         cudaFree(h_mat->offdiag.i);
544:         cudaFree(h_mat->offdiag.j);
545:       }
546:     }
547:     cudaFree(d_mat);
548:     PetscFree(h_mat);
549:     cusp->deviceMat = NULL;
550:   }
551:   return 0;
552: }

554: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
555: {
556:   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
557:   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

560:   if (cusparseStruct->deviceMat) {
561:     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

563:     PetscInfo(A, "Have device matrix\n");
564:     PetscNew(&h_mat);
565:     cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
566:     cudaFree(h_mat->colmap);
567:     if (h_mat->allocated_indices) {
568:       cudaFree(h_mat->diag.i);
569:       cudaFree(h_mat->diag.j);
570:       if (h_mat->offdiag.j) {
571:         cudaFree(h_mat->offdiag.i);
572:         cudaFree(h_mat->offdiag.j);
573:       }
574:     }
575:     cudaFree(d_mat);
576:     PetscFree(h_mat);
577:   }
578:   /* Free COO */
579:   MatResetPreallocationCOO_MPIAIJCUSPARSE(A);
580:   delete cusparseStruct;
581:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
582:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
583:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
584:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
585:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL);
586:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL);
587:   MatDestroy_MPIAIJ(A);
588:   return 0;
589: }

591: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
592: {
593:   Mat_MPIAIJ *a;
594:   Mat         A;

596:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
597:   if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B, MAT_COPY_VALUES, newmat);
598:   else if (reuse == MAT_REUSE_MATRIX) MatCopy(B, *newmat, SAME_NONZERO_PATTERN);
599:   A             = *newmat;
600:   A->boundtocpu = PETSC_FALSE;
601:   PetscFree(A->defaultvectype);
602:   PetscStrallocpy(VECCUDA, &A->defaultvectype);

604:   a = (Mat_MPIAIJ *)A->data;
605:   if (a->A) MatSetType(a->A, MATSEQAIJCUSPARSE);
606:   if (a->B) MatSetType(a->B, MATSEQAIJCUSPARSE);
607:   if (a->lvec) VecSetType(a->lvec, VECSEQCUDA);

609:   if (reuse != MAT_REUSE_MATRIX && !a->spptr) a->spptr = new Mat_MPIAIJCUSPARSE;

611:   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
612:   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
613:   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
614:   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
615:   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
616:   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
617:   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
618:   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

620:   PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE);
621:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
622:   PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
623:   PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);
624:   PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE);
625:   PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE);
626: #if defined(PETSC_HAVE_HYPRE)
627:   PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE);
628: #endif
629:   return 0;
630: }

632: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
633: {
634:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
635:   MatCreate_MPIAIJ(A);
636:   MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A);
637:   return 0;
638: }

640: /*@
641:    MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
642:    (the default parallel PETSc format).  This matrix will ultimately be pushed down
643:    to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
644:    assembly performance the user should preallocate the matrix storage by setting
645:    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
646:    performance during matrix assembly can be increased by more than a factor of 50.

648:    Collective

650:    Input Parameters:
651: +  comm - MPI communicator
652: .  m, n - number of local rows and columns (or `PETSC_DECIDE` to have them calculated from M and N)
653: .  M, N - number of global rows and columns (or `PETSC_DETERMINE` to have them calculated from m and n)
654: .  d_nz, o_nz - number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix (same for all local rows)
655: -  d_nnz, o_nnz - arrays containing the number of nonzeros in the various rows of the diagonal and off-diagonal portions of the local submatrix
656:          (possibly different for each row), or NULL

658:    Output Parameter:
659: .  A - the matrix

661:    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
662:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
663:    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

665:    Notes:
666:    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

668:    The AIJ format (also called
669:    compressed row storage) is fully compatible with standard Fortran 77
670:    storage.  That is, the stored row and column indices can begin at
671:    either one (as in Fortran) or zero.  See the users' manual for details.

673:    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
674:    Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
675:    allocation.  For large problems you MUST preallocate memory or you
676:    will get TERRIBLE performance, see the users' manual chapter on matrices.

678:    By default, this format uses inodes (identical nodes) when possible, to
679:    improve numerical efficiency of matrix-vector products and solves. We
680:    search for consecutive rows with the same nonzero structure, thereby
681:    reusing matrix information to achieve increased efficiency.

683:    Level: intermediate

685: .seealso: `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
686: @*/
687: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
688: {
689:   PetscMPIInt size;

691:   MatCreate(comm, A);
692:   MatSetSizes(*A, m, n, M, N);
693:   MPI_Comm_size(comm, &size);
694:   if (size > 1) {
695:     MatSetType(*A, MATMPIAIJCUSPARSE);
696:     MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
697:   } else {
698:     MatSetType(*A, MATSEQAIJCUSPARSE);
699:     MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
700:   }
701:   return 0;
702: }
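
/*
   Usage sketch (illustrative only, not part of this file): the MatCreate()/MatSetType()/
   MatXXXXSetPreallocation() paradigm recommended in the manual page above.  The global
   size 100 and the per-row nonzero estimates 5 and 2 are placeholders.

     Mat A;
     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
     MatSetType(A, MATAIJCUSPARSE);
     MatSetFromOptions(A);
     MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); // used when the communicator has more than one process
     MatSeqAIJSetPreallocation(A, 5, NULL);          // used on exactly one process
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), use A ...
     MatDestroy(&A);
*/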

704: /*MC
705:    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

707:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
708:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
709:    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

711:    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
712:    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
713:    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
714:    for communicators controlling multiple processes.  It is recommended that you call both of
715:    the above preallocation routines for simplicity.

717:    Options Database Keys:
718: +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
719: .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
720: .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
721: -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).

723:   Level: beginner

725:  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
726: M*/
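
/*
   Illustrative command line exercising the options database keys listed above; the
   executable name ./app and the process count are placeholders:

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_mult_diag_storage_format ell -mat_cusparse_mult_offdiag_storage_format hyb
*/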

728: /*MC
729:    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

731:   Level: beginner

733:  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
734: M*/

736: // Get GPU pointers to a stripped-down Mat, for both Seq and MPI Mat.
737: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
738: {
739:   PetscSplitCSRDataStructure d_mat;
740:   PetscMPIInt                size;
741:   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
742:   PetscScalar               *aa = NULL, *ba = NULL;
743:   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
744:   Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
745:   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

748:   if (A->factortype != MAT_FACTOR_NONE) {
749:     *B = NULL;
750:     return 0;
751:   }
752:   MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
753:   // get jaca
754:   if (size == 1) {
755:     PetscBool isseqaij;

757:     PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij);
758:     if (isseqaij) {
759:       jaca = (Mat_SeqAIJ *)A->data;
761:       cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
762:       d_mat           = cusparsestructA->deviceMat;
763:       MatSeqAIJCUSPARSECopyToGPU(A);
764:     } else {
765:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
767:       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
768:       jaca                      = (Mat_SeqAIJ *)aij->A->data;
769:       cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
770:       d_mat                     = spptr->deviceMat;
771:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
772:     }
773:     if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
774:       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
776:       matrixA = (CsrMatrix *)matstruct->mat;
777:       bi      = NULL;
778:       bj      = NULL;
779:       ba      = NULL;
780:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
781:   } else {
782:     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
784:     jaca                      = (Mat_SeqAIJ *)aij->A->data;
785:     jacb                      = (Mat_SeqAIJ *)aij->B->data;
786:     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;

789:     cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
790:     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
792:     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
793:       MatSeqAIJCUSPARSECopyToGPU(aij->A);
794:       MatSeqAIJCUSPARSECopyToGPU(aij->B);
795:       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
796:       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
799:       matrixA = (CsrMatrix *)matstructA->mat;
800:       matrixB = (CsrMatrix *)matstructB->mat;
801:       if (jacb->compressedrow.use) {
802:         if (!cusparsestructB->rowoffsets_gpu) {
803:           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
804:           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
805:         }
806:         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
807:       } else {
808:         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
809:       }
810:       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
811:       ba = thrust::raw_pointer_cast(matrixB->values->data());
812:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
813:     d_mat = spptr->deviceMat;
814:   }
815:   if (jaca->compressedrow.use) {
816:     if (!cusparsestructA->rowoffsets_gpu) {
817:       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
818:       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
819:     }
820:     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
821:   } else {
822:     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
823:   }
824:   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
825:   aa = thrust::raw_pointer_cast(matrixA->values->data());

827:   if (!d_mat) {
828:     PetscSplitCSRDataStructure h_mat;

830:     // create and populate the struct on the host, then copy it to the device
831:     PetscInfo(A, "Create device matrix\n");
832:     PetscNew(&h_mat);
833:     cudaMalloc((void **)&d_mat, sizeof(*d_mat));
834:     if (size > 1) { /* need the colmap array */
835:       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
836:       PetscInt   *colmap;
837:       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;


841:       PetscCalloc1(N + 1, &colmap);
842:       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
843: #if defined(PETSC_USE_64BIT_INDICES)
844:       { // have to make 64-bit (PetscInt) copies of these 32-bit indices
845:         int      *h_bi32, *h_bj32;
846:         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
847:         PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64);
848:         cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost);
849:         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
850:         cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost);
851:         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];

853:         cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64));
854:         cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice);
855:         cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64));
856:         cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice);

858:         h_mat->offdiag.i         = d_bi64;
859:         h_mat->offdiag.j         = d_bj64;
860:         h_mat->allocated_indices = PETSC_TRUE;

862:         PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64);
863:       }
864: #else
865:       h_mat->offdiag.i         = (PetscInt *)bi;
866:       h_mat->offdiag.j         = (PetscInt *)bj;
867:       h_mat->allocated_indices = PETSC_FALSE;
868: #endif
869:       h_mat->offdiag.a = ba;
870:       h_mat->offdiag.n = A->rmap->n;

872:       cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap));
873:       cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice);
874:       PetscFree(colmap);
875:     }
876:     h_mat->rstart = A->rmap->rstart;
877:     h_mat->rend   = A->rmap->rend;
878:     h_mat->cstart = A->cmap->rstart;
879:     h_mat->cend   = A->cmap->rend;
880:     h_mat->M      = A->cmap->N;
881: #if defined(PETSC_USE_64BIT_INDICES)
882:     {
883:       int      *h_ai32, *h_aj32;
884:       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

886:       static_assert(sizeof(PetscInt) == 8, "");
887:       PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64);
888:       cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost);
889:       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
890:       cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost);
891:       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];

893:       cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64));
894:       cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice);
895:       cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64));
896:       cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice);

898:       h_mat->diag.i            = d_ai64;
899:       h_mat->diag.j            = d_aj64;
900:       h_mat->allocated_indices = PETSC_TRUE;

902:       PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64);
903:     }
904: #else
905:     h_mat->diag.i            = (PetscInt *)ai;
906:     h_mat->diag.j            = (PetscInt *)aj;
907:     h_mat->allocated_indices = PETSC_FALSE;
908: #endif
909:     h_mat->diag.a = aa;
910:     h_mat->diag.n = A->rmap->n;
911:     h_mat->rank   = PetscGlobalRank;
912:     // copy pointers and metadata to device
913:     cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice);
914:     PetscFree(h_mat);
915:   } else {
916:     PetscInfo(A, "Reusing device matrix\n");
917:   }
918:   *B             = d_mat;
919:   A->offloadmask = PETSC_OFFLOAD_GPU;
920:   return 0;
921: }