Actual source code: mpiaijcusparse.cu
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
4: #include <petscconf.h>
5: #include <../src/mat/impls/aij/mpi/mpiaij.h>
6: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
7: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
8: #include <thrust/advance.h>
9: #include <thrust/partition.h>
10: #include <thrust/sort.h>
11: #include <thrust/unique.h>
12: #include <petscsf.h>
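/* Thrust functor used by MatSetValuesCOO_MPIAIJCUSPARSE_Basic(): copies the first element of a
   zip-iterator tuple into the second, i.e. gathers v[coo_p[k]] into the work array coo_pw[k]. */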
14: struct VecCUDAEquals {
15: template <typename Tuple>
16: __host__ __device__ void operator()(Tuple t)
17: {
18: thrust::get<1>(t) = thrust::get<0>(t);
19: }
20: };
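/* Free the device-side COO assembly data: the extended-COO maps, permutations, and send/receive
   buffers (if they were allocated), plus the permutation (coo_p) and work array (coo_pw) used by
   the basic COO path. */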
22: static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
23: {
24: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
25: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
27: if (!cusparseStruct) return 0;
28: if (cusparseStruct->use_extended_coo) {
29: cudaFree(cusparseStruct->Ajmap1_d);
30: cudaFree(cusparseStruct->Aperm1_d);
31: cudaFree(cusparseStruct->Bjmap1_d);
32: cudaFree(cusparseStruct->Bperm1_d);
33: cudaFree(cusparseStruct->Aimap2_d);
34: cudaFree(cusparseStruct->Ajmap2_d);
35: cudaFree(cusparseStruct->Aperm2_d);
36: cudaFree(cusparseStruct->Bimap2_d);
37: cudaFree(cusparseStruct->Bjmap2_d);
38: cudaFree(cusparseStruct->Bperm2_d);
39: cudaFree(cusparseStruct->Cperm1_d);
40: cudaFree(cusparseStruct->sendbuf_d);
41: cudaFree(cusparseStruct->recvbuf_d);
42: }
43: cusparseStruct->use_extended_coo = PETSC_FALSE;
44: delete cusparseStruct->coo_p;
45: delete cusparseStruct->coo_pw;
46: cusparseStruct->coo_p = NULL;
47: cusparseStruct->coo_pw = NULL;
48: return 0;
49: }
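/* Basic COO value insertion: if preallocation reordered the entries (diagonal-block entries first,
   off-diagonal entries last), gather the user-provided values through the stored permutation coo_p
   into coo_pw on the device, then hand the first coo_nd values to the diagonal block a->A and the
   remaining coo_no values to the off-diagonal block a->B. */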
51: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
52: {
53: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
54: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
55: PetscInt n = cusp->coo_nd + cusp->coo_no;
57: if (cusp->coo_p && v) {
58: thrust::device_ptr<const PetscScalar> d_v;
59: THRUSTARRAY *w = NULL;
61: if (isCudaMem(v)) {
62: d_v = thrust::device_pointer_cast(v);
63: } else {
64: w = new THRUSTARRAY(n);
65: w->assign(v, v + n);
66: PetscLogCpuToGpu(n * sizeof(PetscScalar));
67: d_v = w->data();
68: }
70: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
71: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
72: PetscLogGpuTimeBegin();
73: thrust::for_each(zibit, zieit, VecCUDAEquals());
74: PetscLogGpuTimeEnd();
75: delete w;
76: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode);
77: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode);
78: } else {
79: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode);
80: MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode);
81: }
82: return 0;
83: }
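/* Predicates and index transform used below: an entry is "off-diagonal" when its global column lies
   outside this rank's column ownership range [cstart, cend); GlobToLoc shifts a global index by the
   ownership start to obtain a local index. */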
85: template <typename Tuple>
86: struct IsNotOffDiagT {
87: PetscInt _cstart, _cend;
89: IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
90: __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
91: };
93: struct IsOffDiag {
94: PetscInt _cstart, _cend;
96: IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
97: __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
98: };
100: struct GlobToLoc {
101: PetscInt _start;
103: GlobToLoc(PetscInt start) : _start(start) { }
104: __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
105: };
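/* Basic COO preallocation, used when every entry is nonnegative and its row is owned by this rank:
   copy coo_i/coo_j to the device, partition the entries so that diagonal-block entries come first
   (recording the permutation in coo_p), convert the indices from global to local, compact the
   off-diagonal column indices into b->garray, and preallocate the sequential AIJCUSPARSE blocks
   b->A and b->B from the partitioned index arrays. */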
107: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
108: {
109: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
110: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
111: PetscInt N, *jj;
112: size_t noff = 0;
113: THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
114: THRUSTINTARRAY d_j(n);
115: ISLocalToGlobalMapping l2g;
117: MatDestroy(&b->A);
118: MatDestroy(&b->B);
120: PetscLogCpuToGpu(2. * n * sizeof(PetscInt));
121: d_i.assign(coo_i, coo_i + n);
122: d_j.assign(coo_j, coo_j + n);
123: PetscLogGpuTimeBegin();
124: auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
125: auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
126: if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
127: cusp->coo_p = new THRUSTINTARRAY(n);
128: cusp->coo_pw = new THRUSTARRAY(n);
129: thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
130: auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
131: auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
132: auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
133: firstoffd = mzipp.get_iterator_tuple().get<1>();
134: }
135: cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
136: cusp->coo_no = thrust::distance(firstoffd, d_j.end());
138: /* from global to local */
139: thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
140: thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
141: PetscLogGpuTimeEnd();
143: /* copy offdiag column indices to map on the CPU */
144: PetscMalloc1(cusp->coo_no, &jj); /* jj[] will store compacted col ids of the offdiag part */
145: cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost);
146: auto o_j = d_j.begin();
147: PetscLogGpuTimeBegin();
148: thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
149: thrust::sort(thrust::device, o_j, d_j.end());
150: auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
151: PetscLogGpuTimeEnd();
152: noff = thrust::distance(o_j, wit);
153: /* allocate the garray, the columns of B will not be mapped in the multiply setup */
154: PetscMalloc1(noff, &b->garray);
155: cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost);
156: PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt));
157: ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g);
158: ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH);
159: ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj);
161: ISLocalToGlobalMappingDestroy(&l2g);
163: MatCreate(PETSC_COMM_SELF, &b->A);
164: MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
165: MatSetType(b->A, MATSEQAIJCUSPARSE);
166: MatCreate(PETSC_COMM_SELF, &b->B);
167: MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff);
168: MatSetType(b->B, MATSEQAIJCUSPARSE);
170: /* GPU memory, cusparse specific call handles it internally */
171: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get());
172: MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj);
173: PetscFree(jj);
175: MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat);
176: MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat);
178: MatBindToCPU(b->A, B->boundtocpu);
179: MatBindToCPU(b->B, B->boundtocpu);
180: MatSetUpMultiply_MPIAIJ(B);
181: return 0;
182: }
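/* Choose between the two COO paths. If every index is nonnegative and every row is owned by this
   rank (checked on the host only when the index arrays live in host memory), all ranks agree to use
   the simpler device-only basic path above. Otherwise fall back to MatSetPreallocationCOO_MPIAIJ()
   for the general analysis and mirror the resulting maps (Ajmap1/Aperm1/..., Bjmap2/Bperm2/...,
   Cperm1) and the send/receive buffers on the GPU for MatSetValuesCOO_MPIAIJCUSPARSE(). */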
184: static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
185: {
186: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
187: Mat_MPIAIJCUSPARSE *mpidev;
188: PetscBool coo_basic = PETSC_TRUE;
189: PetscMemType mtype = PETSC_MEMTYPE_DEVICE;
190: PetscInt rstart, rend;
192: PetscFree(mpiaij->garray);
193: VecDestroy(&mpiaij->lvec);
194: #if defined(PETSC_USE_CTABLE)
195: PetscTableDestroy(&mpiaij->colmap);
196: #else
197: PetscFree(mpiaij->colmap);
198: #endif
199: VecScatterDestroy(&mpiaij->Mvctx);
200: mat->assembled = PETSC_FALSE;
201: mat->was_assembled = PETSC_FALSE;
202: MatResetPreallocationCOO_MPIAIJ(mat);
203: MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);
204: if (coo_i) {
205: PetscLayoutGetRange(mat->rmap, &rstart, &rend);
206: PetscGetMemType(coo_i, &mtype);
207: if (PetscMemTypeHost(mtype)) {
208: for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
209: if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
210: coo_basic = PETSC_FALSE;
211: break;
212: }
213: }
214: }
215: }
216: /* All ranks must agree on the value of coo_basic */
217: MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat));
218: if (coo_basic) {
219: MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j);
220: } else {
221: MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j);
222: mat->offloadmask = PETSC_OFFLOAD_CPU;
223: /* creates the GPU memory */
224: MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);
225: MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);
226: mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
227: mpidev->use_extended_coo = PETSC_TRUE;
229: cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount));
230: cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount));
232: cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount));
233: cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount));
235: cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount));
236: cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount));
237: cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount));
239: cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount));
240: cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount));
241: cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount));
243: cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount));
244: cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar));
245: cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar));
247: cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
248: cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);
250: cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
251: cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice);
253: cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
254: cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
255: cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
257: cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
258: cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice);
259: cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice);
261: cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice);
262: }
263: return 0;
264: }
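/* Grid-stride kernel: pack the values destined for other ranks into the send buffer,
   buf[i] = kv[perm[i]]. */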
266: __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
267: {
268: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
269: const PetscCount grid_size = gridDim.x * blockDim.x;
270: for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
271: }
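/* Grid-stride kernel: for nonzero i of the diagonal block (i < Annz) or the off-diagonal block
   (i >= Annz), sum the locally provided COO values selected by Aperm1/Bperm1 over the range
   [Ajmap1[i], Ajmap1[i+1]) (resp. Bjmap1), then either overwrite (INSERT_VALUES) or add to the
   stored matrix value. */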
273: __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
274: {
275: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
276: const PetscCount grid_size = gridDim.x * blockDim.x;
277: for (; i < Annz + Bnnz; i += grid_size) {
278: PetscScalar sum = 0.0;
279: if (i < Annz) {
280: for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
281: Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
282: } else {
283: i -= Annz;
284: for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
285: Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
286: }
287: }
288: }
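/* Grid-stride kernel: accumulate values received from other ranks. Aimap2/Bimap2 give the location
   of the i-th remotely contributed nonzero in the A/B value arrays, and Ajmap2/Bjmap2 delimit the
   received entries that contribute to it. Remote contributions are always added. */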
290: __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
291: {
292: PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
293: const PetscCount grid_size = gridDim.x * blockDim.x;
294: for (; i < Annz2 + Bnnz2; i += grid_size) {
295: if (i < Annz2) {
296: for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
297: } else {
298: i -= Annz2;
299: for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
300: }
301: }
302: }
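/* Extended COO value insertion: copy v[] to the device if it is host memory, pack the entries owned
   by other ranks and start a PetscSF reduction to deliver them, add the local entries while that
   communication is in flight, then add the received remote entries. The basic path is used when the
   extended maps were not built. */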
304: static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
305: {
306: Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
307: Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
308: Mat A = mpiaij->A, B = mpiaij->B;
309: PetscCount Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
310: PetscScalar *Aa, *Ba = NULL;
311: PetscScalar *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
312: const PetscScalar *v1 = v;
313: const PetscCount *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
314: const PetscCount *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
315: const PetscCount *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
316: const PetscCount *Cperm1 = mpidev->Cperm1_d;
317: PetscMemType memtype;
319: if (mpidev->use_extended_coo) {
320: PetscMPIInt size;
322: MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size);
323: PetscGetMemType(v, &memtype);
324: if (PetscMemTypeHost(memtype)) { /* If the user gave v[] in host memory, we need to copy it to the device */
325: cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar));
326: cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
327: }
329: if (imode == INSERT_VALUES) {
330: MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa); /* write matrix values */
331: MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba);
332: } else {
333: MatSeqAIJCUSPARSEGetArray(A, &Aa); /* read & write matrix values */
334: MatSeqAIJCUSPARSEGetArray(B, &Ba);
335: }
337: /* Pack entries to be sent to remote */
338: if (mpiaij->sendlen) {
339: MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
340: cudaPeekAtLastError();
341: }
343: /* Send remote entries to their owner and overlap the communication with local computation */
344: PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE);
345: /* Add local entries to A and B */
346: if (Annz + Bnnz > 0) {
347: MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
348: cudaPeekAtLastError();
349: }
350: PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE);
352: /* Add received remote entries to A and B */
353: if (Annz2 + Bnnz2 > 0) {
354: MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
355: cudaPeekAtLastError();
356: }
358: if (imode == INSERT_VALUES) {
359: MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa);
360: MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba);
361: } else {
362: MatSeqAIJCUSPARSERestoreArray(A, &Aa);
363: MatSeqAIJCUSPARSERestoreArray(B, &Ba);
364: }
365: if (PetscMemTypeHost(memtype)) cudaFree((void *)v1);
366: } else {
367: MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode);
368: }
369: mat->offloadmask = PETSC_OFFLOAD_GPU;
370: return 0;
371: }
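/* Illustrative sketch (not part of this file) of how user code typically drives the COO assembly
   path implemented above; the sizes and arrays below are hypothetical placeholders.

     Mat          A;
     PetscCount   ncoo  = ...;                 // number of (i, j, v) triples provided by this rank
     PetscInt    *coo_i = ..., *coo_j = ...;   // global row/column indices, host or device memory
     PetscScalar *coo_v = ...;                 // values, one per triple
     MatCreate(comm, &A);
     MatSetSizes(A, m, n, M, N);
     MatSetType(A, MATMPIAIJCUSPARSE);
     MatSetPreallocationCOO(A, ncoo, coo_i, coo_j); // pattern analysis (routines above)
     MatSetValuesCOO(A, coo_v, ADD_VALUES);         // can be called repeatedly with new values
*/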
373: static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
374: {
375: Mat Ad, Ao;
376: const PetscInt *cmap;
378: MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap);
379: MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc);
380: if (glob) {
381: PetscInt cst, i, dn, on, *gidx;
383: MatGetLocalSize(Ad, NULL, &dn);
384: MatGetLocalSize(Ao, NULL, &on);
385: MatGetOwnershipRangeColumn(A, &cst, NULL);
386: PetscMalloc1(dn + on, &gidx);
387: for (i = 0; i < dn; i++) gidx[i] = cst + i;
388: for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
389: ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob);
390: }
391: return 0;
392: }
394: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
395: {
396: Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
397: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
398: PetscInt i;
400: PetscLayoutSetUp(B->rmap);
401: PetscLayoutSetUp(B->cmap);
402: if (PetscDefined(USE_DEBUG) && d_nnz) {
403: for (i = 0; i < B->rmap->n; i++) if (d_nnz[i] < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
404: }
405: if (PetscDefined(USE_DEBUG) && o_nnz) {
406: for (i = 0; i < B->rmap->n; i++) if (o_nnz[i] < 0) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
407: }
408: #if defined(PETSC_USE_CTABLE)
409: PetscTableDestroy(&b->colmap);
410: #else
411: PetscFree(b->colmap);
412: #endif
413: PetscFree(b->garray);
414: VecDestroy(&b->lvec);
415: VecScatterDestroy(&b->Mvctx);
416: /* Because the B will have been resized we simply destroy it and create a new one each time */
417: MatDestroy(&b->B);
418: if (!b->A) {
419: MatCreate(PETSC_COMM_SELF, &b->A);
420: MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n);
421: }
422: if (!b->B) {
423: PetscMPIInt size;
424: MPI_Comm_size(PetscObjectComm((PetscObject)B), &size);
425: MatCreate(PETSC_COMM_SELF, &b->B);
426: MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0);
427: }
428: MatSetType(b->A, MATSEQAIJCUSPARSE);
429: MatSetType(b->B, MATSEQAIJCUSPARSE);
430: MatBindToCPU(b->A, B->boundtocpu);
431: MatBindToCPU(b->B, B->boundtocpu);
432: MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz);
433: MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz);
434: MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat);
435: MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat);
436: B->preallocated = PETSC_TRUE;
437: return 0;
438: }
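/* y = A*x: start scattering the off-process entries of x into a->lvec, multiply by the diagonal
   block while the scatter is in flight, finish the scatter, then add the off-diagonal block
   contribution B*lvec. */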
440: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
441: {
442: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
444: VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
445: (*a->A->ops->mult)(a->A, xx, yy);
446: VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
447: (*a->B->ops->multadd)(a->B, a->lvec, yy, yy);
448: return 0;
449: }
451: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
452: {
453: Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
455: MatZeroEntries(l->A);
456: MatZeroEntries(l->B);
457: return 0;
458: }
460: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
461: {
462: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
464: VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
465: (*a->A->ops->multadd)(a->A, xx, yy, zz);
466: VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD);
467: (*a->B->ops->multadd)(a->B, a->lvec, zz, zz);
468: return 0;
469: }
471: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
472: {
473: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
475: (*a->B->ops->multtranspose)(a->B, xx, a->lvec);
476: (*a->A->ops->multtranspose)(a->A, xx, yy);
477: VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
478: VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE);
479: return 0;
480: }
482: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
483: {
484: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
485: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
487: switch (op) {
488: case MAT_CUSPARSE_MULT_DIAG:
489: cusparseStruct->diagGPUMatFormat = format;
490: break;
491: case MAT_CUSPARSE_MULT_OFFDIAG:
492: cusparseStruct->offdiagGPUMatFormat = format;
493: break;
494: case MAT_CUSPARSE_ALL:
495: cusparseStruct->diagGPUMatFormat = format;
496: cusparseStruct->offdiagGPUMatFormat = format;
497: break;
498: default:
499: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
500: }
501: return 0;
502: }
504: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
505: {
506: MatCUSPARSEStorageFormat format;
507: PetscBool flg;
508: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
509: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
511: PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
512: if (A->factortype == MAT_FACTOR_NONE) {
513: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
514: if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format);
515: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg);
516: if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format);
517: PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg);
518: if (flg) MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format);
519: }
520: PetscOptionsHeadEnd();
521: return 0;
522: }
524: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
525: {
526: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
527: Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
528: PetscObjectState onnz = A->nonzerostate;
530: MatAssemblyEnd_MPIAIJ(A, mode);
531: if (mpiaij->lvec) VecSetType(mpiaij->lvec, VECSEQCUDA);
532: if (onnz != A->nonzerostate && cusp->deviceMat) {
533: PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
535: PetscInfo(A, "Destroy device mat since nonzerostate changed\n");
536: PetscNew(&h_mat);
537: cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
538: cudaFree(h_mat->colmap);
539: if (h_mat->allocated_indices) {
540: cudaFree(h_mat->diag.i);
541: cudaFree(h_mat->diag.j);
542: if (h_mat->offdiag.j) {
543: cudaFree(h_mat->offdiag.i);
544: cudaFree(h_mat->offdiag.j);
545: }
546: }
547: cudaFree(d_mat);
548: PetscFree(h_mat);
549: cusp->deviceMat = NULL;
550: }
551: return 0;
552: }
554: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
555: {
556: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
557: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
560: if (cusparseStruct->deviceMat) {
561: PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
563: PetscInfo(A, "Have device matrix\n");
564: PetscNew(&h_mat);
565: cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost);
566: cudaFree(h_mat->colmap);
567: if (h_mat->allocated_indices) {
568: cudaFree(h_mat->diag.i);
569: cudaFree(h_mat->diag.j);
570: if (h_mat->offdiag.j) {
571: cudaFree(h_mat->offdiag.i);
572: cudaFree(h_mat->offdiag.j);
573: }
574: }
575: cudaFree(d_mat);
576: PetscFree(h_mat);
577: }
578: /* Free COO */
579: MatResetPreallocationCOO_MPIAIJCUSPARSE(A);
580: delete cusparseStruct;
581: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL);
582: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL);
583: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL);
584: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL);
585: PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL);
586: PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL);
587: MatDestroy_MPIAIJ(A);
588: return 0;
589: }
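/* Convert a MATMPIAIJ matrix to MATMPIAIJCUSPARSE: switch the diagonal/off-diagonal blocks to
   MATSEQAIJCUSPARSE and the local work vector to VECSEQCUDA, allocate the Mat_MPIAIJCUSPARSE
   context if needed, and install the CUSPARSE-specific operations and composed functions. */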
591: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
592: {
593: Mat_MPIAIJ *a;
594: Mat A;
596: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
597: if (reuse == MAT_INITIAL_MATRIX) MatDuplicate(B, MAT_COPY_VALUES, newmat);
598: else if (reuse == MAT_REUSE_MATRIX) MatCopy(B, *newmat, SAME_NONZERO_PATTERN);
599: A = *newmat;
600: A->boundtocpu = PETSC_FALSE;
601: PetscFree(A->defaultvectype);
602: PetscStrallocpy(VECCUDA, &A->defaultvectype);
604: a = (Mat_MPIAIJ *)A->data;
605: if (a->A) MatSetType(a->A, MATSEQAIJCUSPARSE);
606: if (a->B) MatSetType(a->B, MATSEQAIJCUSPARSE);
607: if (a->lvec) VecSetType(a->lvec, VECSEQCUDA);
609: if (reuse != MAT_REUSE_MATRIX && !a->spptr) a->spptr = new Mat_MPIAIJCUSPARSE;
611: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
612: A->ops->mult = MatMult_MPIAIJCUSPARSE;
613: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
614: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
615: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
616: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
617: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
618: A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
620: PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE);
621: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);
622: PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
623: PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE);
624: PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE);
625: PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE);
626: #if defined(PETSC_HAVE_HYPRE)
627: PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE);
628: #endif
629: return 0;
630: }
632: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
633: {
634: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
635: MatCreate_MPIAIJ(A);
636: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A);
637: return 0;
638: }
640: /*@
641: MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
642: (the default parallel PETSc format). This matrix will ultimately be pushed down
643: to NVIDIA GPUs and use the cuSPARSE library for calculations. For good matrix
644: assembly performance the user should preallocate the matrix storage by setting
645: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
646: performance during matrix assembly can be increased by more than a factor of 50.
648: Collective
650: Input Parameters:
651: +  comm - MPI communicator
652: .  m    - number of local rows (or `PETSC_DECIDE`)
653: .  n    - number of local columns (or `PETSC_DECIDE`)
654: .  M, N - number of global rows and columns (or `PETSC_DETERMINE`)
655: .  d_nz, d_nnz - number of nonzeros per row (same for all rows), or an array of per-row nonzero counts (or NULL), for the diagonal portion of the local submatrix
656: -  o_nz, o_nnz - number of nonzeros per row (same for all rows), or an array of per-row nonzero counts (or NULL), for the off-diagonal portion of the local submatrix
658: Output Parameter:
659: . A - the matrix
661: It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
662: MatXXXXSetPreallocation() paradigm instead of this routine directly.
663: [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
665: Notes:
666: If d_nnz (resp. o_nnz) is given, then d_nz (resp. o_nz) is ignored
668: The AIJ format (also called
669: compressed row storage) is fully compatible with standard Fortran 77
670: storage. That is, the stored row and column indices can begin at
671: either one (as in Fortran) or zero. See the users' manual for details.
673: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
674: Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
675: allocation. For large problems you MUST preallocate memory or you
676: will get TERRIBLE performance; see the users' manual chapter on matrices.
678: By default, this format uses inodes (identical nodes) when possible, to
679: improve numerical efficiency of matrix-vector products and solves. We
680: search for consecutive rows with the same nonzero structure, thereby
681: reusing matrix information to achieve increased efficiency.
683: Level: intermediate
685: .seealso: `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
686: @*/
687: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
688: {
689: PetscMPIInt size;
691: MatCreate(comm, A);
692: MatSetSizes(*A, m, n, M, N);
693: MPI_Comm_size(comm, &size);
694: if (size > 1) {
695: MatSetType(*A, MATMPIAIJCUSPARSE);
696: MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz);
697: } else {
698: MatSetType(*A, MATSEQAIJCUSPARSE);
699: MatSeqAIJSetPreallocation(*A, d_nz, d_nnz);
700: }
701: return 0;
702: }
704: /*MC
705: MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
707: A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
708: CSR, ELL (ellpack), or HYB (hybrid) format. The ELL and HYB formats require CUDA 4.2 or later.
709: All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
711: This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
712: and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
713: `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
714: for communicators controlling multiple processes. It is recommended that you call both of
715: the above preallocation routines for simplicity.
717: Options Database Keys:
718: + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
719: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
720: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
721: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
723: Level: beginner
725: .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
726: M*/
728: /*MC
729: MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
731: Level: beginner
733: .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
734: M*/
736: // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
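// Builds (or reuses) a PetscSplitCSRDataStructure: a host-side struct is filled with raw device
// pointers to the CSR arrays of the diagonal (and, in parallel, the off-diagonal) block plus the
// colmap and ownership ranges, copied to the GPU, and returned in *B. With 64-bit PetscInt the
// 32-bit cuSPARSE row/column indices are widened into freshly allocated device arrays.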
737: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
738: {
739: PetscSplitCSRDataStructure d_mat;
740: PetscMPIInt size;
741: int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
742: PetscScalar *aa = NULL, *ba = NULL;
743: Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
744: Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
745: CsrMatrix *matrixA = NULL, *matrixB = NULL;
748: if (A->factortype != MAT_FACTOR_NONE) {
749: *B = NULL;
750: return 0;
751: }
752: MPI_Comm_size(PetscObjectComm((PetscObject)A), &size);
753: // get jaca
754: if (size == 1) {
755: PetscBool isseqaij;
757: PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij);
758: if (isseqaij) {
759: jaca = (Mat_SeqAIJ *)A->data;
761: cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
762: d_mat = cusparsestructA->deviceMat;
763: MatSeqAIJCUSPARSECopyToGPU(A);
764: } else {
765: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
767: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
768: jaca = (Mat_SeqAIJ *)aij->A->data;
769: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
770: d_mat = spptr->deviceMat;
771: MatSeqAIJCUSPARSECopyToGPU(aij->A);
772: }
773: if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
774: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
776: matrixA = (CsrMatrix *)matstruct->mat;
777: bi = NULL;
778: bj = NULL;
779: ba = NULL;
780: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
781: } else {
782: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
784: jaca = (Mat_SeqAIJ *)aij->A->data;
785: jacb = (Mat_SeqAIJ *)aij->B->data;
786: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
789: cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
790: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
792: if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
793: MatSeqAIJCUSPARSECopyToGPU(aij->A);
794: MatSeqAIJCUSPARSECopyToGPU(aij->B);
795: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
796: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
799: matrixA = (CsrMatrix *)matstructA->mat;
800: matrixB = (CsrMatrix *)matstructB->mat;
801: if (jacb->compressedrow.use) {
802: if (!cusparsestructB->rowoffsets_gpu) {
803: cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
804: cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
805: }
806: bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
807: } else {
808: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
809: }
810: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
811: ba = thrust::raw_pointer_cast(matrixB->values->data());
812: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
813: d_mat = spptr->deviceMat;
814: }
815: if (jaca->compressedrow.use) {
816: if (!cusparsestructA->rowoffsets_gpu) {
817: cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
818: cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
819: }
820: ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
821: } else {
822: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
823: }
824: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
825: aa = thrust::raw_pointer_cast(matrixA->values->data());
827: if (!d_mat) {
828: PetscSplitCSRDataStructure h_mat;
830: // create and populate the struct on the host, then copy it to the device
831: PetscInfo(A, "Create device matrix\n");
832: PetscNew(&h_mat);
833: cudaMalloc((void **)&d_mat, sizeof(*d_mat));
834: if (size > 1) { /* need the colmap array */
835: Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
836: PetscInt *colmap;
837: PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N;
841: PetscCalloc1(N + 1, &colmap);
842: for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
843: #if defined(PETSC_USE_64BIT_INDICES)
844: { // have to make a long version of these
845: int *h_bi32, *h_bj32;
846: PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
847: PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64);
848: cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost);
849: for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
850: cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost);
851: for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
853: cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64));
854: cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice);
855: cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64));
856: cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice);
858: h_mat->offdiag.i = d_bi64;
859: h_mat->offdiag.j = d_bj64;
860: h_mat->allocated_indices = PETSC_TRUE;
862: PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64);
863: }
864: #else
865: h_mat->offdiag.i = (PetscInt *)bi;
866: h_mat->offdiag.j = (PetscInt *)bj;
867: h_mat->allocated_indices = PETSC_FALSE;
868: #endif
869: h_mat->offdiag.a = ba;
870: h_mat->offdiag.n = A->rmap->n;
872: cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap));
873: cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice);
874: PetscFree(colmap);
875: }
876: h_mat->rstart = A->rmap->rstart;
877: h_mat->rend = A->rmap->rend;
878: h_mat->cstart = A->cmap->rstart;
879: h_mat->cend = A->cmap->rend;
880: h_mat->M = A->cmap->N;
881: #if defined(PETSC_USE_64BIT_INDICES)
882: {
883: int *h_ai32, *h_aj32;
884: PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
886: static_assert(sizeof(PetscInt) == 8, "");
887: PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64);
888: cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost);
889: for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
890: cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost);
891: for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
893: cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64));
894: cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice);
895: cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64));
896: cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice);
898: h_mat->diag.i = d_ai64;
899: h_mat->diag.j = d_aj64;
900: h_mat->allocated_indices = PETSC_TRUE;
902: PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64);
903: }
904: #else
905: h_mat->diag.i = (PetscInt *)ai;
906: h_mat->diag.j = (PetscInt *)aj;
907: h_mat->allocated_indices = PETSC_FALSE;
908: #endif
909: h_mat->diag.a = aa;
910: h_mat->diag.n = A->rmap->n;
911: h_mat->rank = PetscGlobalRank;
912: // copy pointers and metadata to device
913: cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice);
914: PetscFree(h_mat);
915: } else {
916: PetscInfo(A, "Reusing device matrix\n");
917: }
918: *B = d_mat;
919: A->offloadmask = PETSC_OFFLOAD_GPU;
920: return 0;
921: }