Actual source code: densecuda.cu

  1: /*
  2:      Defines the matrix operations for sequential dense matrices with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsc/private/cudavecimpl.h>
  8: #include <thrust/device_ptr.h>
  9: #include <thrust/functional.h>
 10: #include <thrust/iterator/counting_iterator.h>
 11: #include <thrust/iterator/transform_iterator.h>
 12: #include <thrust/iterator/permutation_iterator.h>
 13: #include <thrust/transform.h>
 14: #include <thrust/device_vector.h>

 16: #if defined(PETSC_USE_COMPLEX)
 17:   #if defined(PETSC_USE_REAL_SINGLE)
 18:     #define cusolverDnXpotrf(a, b, c, d, e, f, g, h)                        cusolverDnCpotrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
 19:     #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   cusolverDnCpotrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
 20:     #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i)                     cusolverDnCpotrs((a), (b), (c), (d), (cuComplex *)(e), (f), (cuComplex *)(g), (h), (i))
 21:     #define cusolverDnXpotri(a, b, c, d, e, f, g, h)                        cusolverDnCpotri((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
 22:     #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f)                   cusolverDnCpotri_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
 23:     #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     cusolverDnCsytrf((a), (b), (c), (cuComplex *)(d), (e), (f), (cuComplex *)(g), (h), (i))
 24:     #define cusolverDnXsytrf_bufferSize(a, b, c, d, e)                      cusolverDnCsytrf_bufferSize((a), (b), (cuComplex *)(c), (d), (e))
 25:     #define cusolverDnXgetrf(a, b, c, d, e, f, g, h)                        cusolverDnCgetrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
 26:     #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   cusolverDnCgetrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
 27:     #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j)                  cusolverDnCgetrs((a), (b), (c), (d), (cuComplex *)(e), (f), (g), (cuComplex *)(h), (i), (j))
 28:     #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   cusolverDnCgeqrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
 29:     #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     cusolverDnCgeqrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (cuComplex *)(g), (h), (i))
 30:     #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnCunmqr_bufferSize((a), (b), (c), (d), (e), (f), (cuComplex *)(g), (h), (cuComplex *)(i), (cuComplex *)(j), (k), (l))
 31:     #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      cusolverDnCunmqr((a), (b), (c), (d), (e), (f), (cuComplex *)(g), (h), (cuComplex *)(i), (cuComplex *)(j), (k), (cuComplex *)(l), (m), (n))
 32:     #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 cublasCtrsm((a), (b), (c), (d), (e), (f), (g), (cuComplex *)(h), (cuComplex *)(i), (j), (cuComplex *)(k), (l))
 33:   #else /* complex double */
 34:     #define cusolverDnXpotrf(a, b, c, d, e, f, g, h)                        cusolverDnZpotrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
 35:     #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   cusolverDnZpotrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
 36:     #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i)                     cusolverDnZpotrs((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (cuDoubleComplex *)(g), (h), (i))
 37:     #define cusolverDnXpotri(a, b, c, d, e, f, g, h)                        cusolverDnZpotri((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
 38:     #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f)                   cusolverDnZpotri_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
 39:     #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     cusolverDnZsytrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (f), (cuDoubleComplex *)(g), (h), (i))
 40:     #define cusolverDnXsytrf_bufferSize(a, b, c, d, e)                      cusolverDnZsytrf_bufferSize((a), (b), (cuDoubleComplex *)(c), (d), (e))
 41:     #define cusolverDnXgetrf(a, b, c, d, e, f, g, h)                        cusolverDnZgetrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
 42:     #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   cusolverDnZgetrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
 43:     #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j)                  cusolverDnZgetrs((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (g), (cuDoubleComplex *)(h), (i), (j))
 44:     #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   cusolverDnZgeqrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
 45:     #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     cusolverDnZgeqrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (cuDoubleComplex *)(g), (h), (i))
 46:     #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnZunmqr_bufferSize((a), (b), (c), (d), (e), (f), (cuDoubleComplex *)(g), (h), (cuDoubleComplex *)(i), (cuDoubleComplex *)(j), (k), (l))
 47:     #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      cusolverDnZunmqr((a), (b), (c), (d), (e), (f), (cuDoubleComplex *)(g), (h), (cuDoubleComplex *)(i), (cuDoubleComplex *)(j), (k), (cuDoubleComplex *)(l), (m), (n))
 48:     #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 cublasZtrsm((a), (b), (c), (d), (e), (f), (g), (cuDoubleComplex *)(h), (cuDoubleComplex *)(i), (j), (cuDoubleComplex *)(k), (l))
 49:   #endif
 50: #else /* real */
 51:   #if defined(PETSC_USE_REAL_SINGLE)
 52:     #define cusolverDnXpotrf(a, b, c, d, e, f, g, h)                        cusolverDnSpotrf((a), (b), (c), (d), (e), (f), (g), (h))
 53:     #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   cusolverDnSpotrf_bufferSize((a), (b), (c), (d), (e), (f))
 54:     #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i)                     cusolverDnSpotrs((a), (b), (c), (d), (e), (f), (g), (h), (i))
 55:     #define cusolverDnXpotri(a, b, c, d, e, f, g, h)                        cusolverDnSpotri((a), (b), (c), (d), (e), (f), (g), (h))
 56:     #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f)                   cusolverDnSpotri_bufferSize((a), (b), (c), (d), (e), (f))
 57:     #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     cusolverDnSsytrf((a), (b), (c), (d), (e), (f), (g), (h), (i))
 58:     #define cusolverDnXsytrf_bufferSize(a, b, c, d, e)                      cusolverDnSsytrf_bufferSize((a), (b), (c), (d), (e))
 59:     #define cusolverDnXgetrf(a, b, c, d, e, f, g, h)                        cusolverDnSgetrf((a), (b), (c), (d), (e), (f), (g), (h))
 60:     #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   cusolverDnSgetrf_bufferSize((a), (b), (c), (d), (e), (f))
 61:     #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j)                  cusolverDnSgetrs((a), (b), (c), (d), (e), (f), (g), (h), (i), (j))
 62:     #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   cusolverDnSgeqrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
 63:     #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     cusolverDnSgeqrf((a), (b), (c), (float *)(d), (e), (float *)(f), (float *)(g), (h), (i))
 64:     #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnSormqr_bufferSize((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (l))
 65:     #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      cusolverDnSormqr((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (float *)(l), (m), (n))
 66:     #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 cublasStrsm((a), (b), (c), (d), (e), (f), (g), (float *)(h), (float *)(i), (j), (float *)(k), (l))
 67:   #else /* real double */
 68:     #define cusolverDnXpotrf(a, b, c, d, e, f, g, h)                        cusolverDnDpotrf((a), (b), (c), (d), (e), (f), (g), (h))
 69:     #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f)                   cusolverDnDpotrf_bufferSize((a), (b), (c), (d), (e), (f))
 70:     #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i)                     cusolverDnDpotrs((a), (b), (c), (d), (e), (f), (g), (h), (i))
 71:     #define cusolverDnXpotri(a, b, c, d, e, f, g, h)                        cusolverDnDpotri((a), (b), (c), (d), (e), (f), (g), (h))
 72:     #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f)                   cusolverDnDpotri_bufferSize((a), (b), (c), (d), (e), (f))
 73:     #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i)                     cusolverDnDsytrf((a), (b), (c), (d), (e), (f), (g), (h), (i))
 74:     #define cusolverDnXsytrf_bufferSize(a, b, c, d, e)                      cusolverDnDsytrf_bufferSize((a), (b), (c), (d), (e))
 75:     #define cusolverDnXgetrf(a, b, c, d, e, f, g, h)                        cusolverDnDgetrf((a), (b), (c), (d), (e), (f), (g), (h))
 76:     #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f)                   cusolverDnDgetrf_bufferSize((a), (b), (c), (d), (e), (f))
 77:     #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j)                  cusolverDnDgetrs((a), (b), (c), (d), (e), (f), (g), (h), (i), (j))
 78:     #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f)                   cusolverDnDgeqrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
 79:     #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i)                     cusolverDnDgeqrf((a), (b), (c), (double *)(d), (e), (double *)(f), (double *)(g), (h), (i))
 80:     #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnDormqr_bufferSize((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (l))
 81:     #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      cusolverDnDormqr((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (double *)(l), (m), (n))
 82:     #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l)                 cublasDtrsm((a), (b), (c), (d), (e), (f), (g), (double *)(h), (double *)(i), (j), (double *)(k), (l))
 83:   #endif
 84: #endif
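/* Illustrative note (not part of the original source): the cusolverDnX... and cublasXtrsm
   wrappers above give this file a single, precision-independent spelling for each dense
   kernel.  For example, in a default PETSc build (real scalars, double precision) the call

     cusolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->d_fact_ipiv, dA->d_fact_info);

   expands to cusolverDnDgetrf(...), while a complex single-precision build expands the same
   line to cusolverDnCgetrf(...) with the scalar pointers cast to cuComplex *. */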

 86: typedef struct {
 87:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 88:   PetscBool    user_alloc;
 89:   PetscScalar *unplacedarray; /* if MatDenseCUDAPlaceArray() was called, this is where the original array is stashed */
 90:   PetscBool    unplaced_user_alloc;
 91:   /* factorization support */
 92:   PetscCuBLASInt *d_fact_ipiv; /* device pivots */
 93:   PetscScalar    *d_fact_tau;  /* device QR tau vector */
 94:   PetscScalar    *d_fact_work; /* device workspace */
 95:   PetscCuBLASInt  fact_lwork;
 96:   PetscCuBLASInt *d_fact_info; /* device info */
 97:   /* workspace */
 98:   Vec workvec;
 99: } Mat_SeqDenseCUDA;
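/* Illustrative note (not part of the original source): d_v holds the matrix in the same
   column-major (LAPACK-style) layout as the host array in Mat_SeqDense, so entry (i,j) of an
   m x n matrix with leading dimension lda >= m lives at

     d_v[i + (size_t)j * lda]

   When lda > m there are padding entries between consecutive columns, which is why several
   routines below distinguish the contiguous (lda == m) and strided (lda > m) cases. */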

101: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
102: {
103:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
104:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
105:   PetscBool         iscuda;

107:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &iscuda);
108:   if (!iscuda) return 0;
109:   PetscLayoutSetUp(A->rmap);
110:   PetscLayoutSetUp(A->cmap);
 111:   /* the CPU preallocation may not have been performed yet */
112:   if (cA->lda <= 0) cA->lda = A->rmap->n;
113:   if (!dA->user_alloc) cudaFree(dA->d_v);
114:   if (!d_data) { /* petsc-allocated storage */
115:     size_t sz;

117:     PetscIntMultError(cA->lda, A->cmap->n, NULL);
118:     sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
119:     cudaMalloc((void **)&dA->d_v, sz);
120:     cudaMemset(dA->d_v, 0, sz);
121:     dA->user_alloc = PETSC_FALSE;
122:   } else { /* user-allocated storage */
123:     dA->d_v        = d_data;
124:     dA->user_alloc = PETSC_TRUE;
125:   }
126:   A->offloadmask  = PETSC_OFFLOAD_GPU;
127:   A->preallocated = PETSC_TRUE;
128:   A->assembled    = PETSC_TRUE;
129:   return 0;
130: }

132: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
133: {
134:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
135:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

138:   PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
139:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
140:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
141:       MatSeqDenseSetPreallocation(A, NULL);
142:     }
143:     PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0);
144:     if (cA->lda > A->rmap->n) {
145:       cudaMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyDeviceToHost);
146:     } else {
147:       cudaMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyDeviceToHost);
148:     }
149:     PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
150:     PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0);

152:     A->offloadmask = PETSC_OFFLOAD_BOTH;
153:   }
154:   return 0;
155: }

157: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
158: {
159:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
160:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
161:   PetscBool         copy;

164:   if (A->boundtocpu) return 0;
165:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
166:   PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
167:   if (copy) {
168:     if (!dA->d_v) { /* Allocate GPU memory if not present */
169:       MatSeqDenseCUDASetPreallocation(A, NULL);
170:     }
171:     PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0);
172:     if (cA->lda > A->rmap->n) {
173:       cudaMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyHostToDevice);
174:     } else {
175:       cudaMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyHostToDevice);
176:     }
177:     PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
178:     PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0);

180:     A->offloadmask = PETSC_OFFLOAD_BOTH;
181:   }
182:   return 0;
183: }
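/* Illustrative note (not part of the original source): in the two copy routines above, when
   lda > m the columns are not contiguous, so cudaMemcpy2D is used with
     dpitch = spitch = lda * sizeof(PetscScalar)   (distance between column starts, in bytes)
     width  = m * sizeof(PetscScalar)              (bytes of actual matrix data per column)
     height = n                                    (number of columns)
   which transfers only the m x n entries and skips the padding rows. */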

185: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A, Mat B, MatStructure str)
186: {
187:   const PetscScalar *va;
188:   PetscScalar       *vb;
189:   PetscInt           lda1, lda2, m = A->rmap->n, n = A->cmap->n;

191:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
192:   if (A->ops->copy != B->ops->copy) {
193:     MatCopy_Basic(A, B, str);
194:     return 0;
195:   }
197:   MatDenseCUDAGetArrayRead(A, &va);
198:   MatDenseCUDAGetArrayWrite(B, &vb);
199:   MatDenseGetLDA(A, &lda1);
200:   MatDenseGetLDA(B, &lda2);
201:   PetscLogGpuTimeBegin();
202:   if (lda1 > m || lda2 > m) {
203:     cudaMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, cudaMemcpyDeviceToDevice);
204:   } else {
205:     cudaMemcpy(vb, va, m * (n * sizeof(PetscScalar)), cudaMemcpyDeviceToDevice);
206:   }
207:   PetscLogGpuTimeEnd();
208:   MatDenseCUDARestoreArrayWrite(B, &vb);
209:   MatDenseCUDARestoreArrayRead(A, &va);
210:   return 0;
211: }

213: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
214: {
215:   PetscScalar *va;
216:   PetscInt     lda, m = A->rmap->n, n = A->cmap->n;

218:   MatDenseCUDAGetArrayWrite(A, &va);
219:   MatDenseGetLDA(A, &lda);
220:   PetscLogGpuTimeBegin();
221:   if (lda > m) {
222:     cudaMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n);
223:   } else {
224:     cudaMemset(va, 0, m * (n * sizeof(PetscScalar)));
225:   }
226:   PetscLogGpuTimeEnd();
227:   MatDenseCUDARestoreArrayWrite(A, &va);
228:   return 0;
229: }

231: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
232: {
233:   Mat_SeqDense     *aa = (Mat_SeqDense *)A->data;
234:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

239:   if (aa->v) MatSeqDenseCUDACopyToGPU(A);
240:   dA->unplacedarray       = dA->d_v;
241:   dA->unplaced_user_alloc = dA->user_alloc;
242:   dA->d_v                 = (PetscScalar *)a;
243:   dA->user_alloc          = PETSC_TRUE;
244:   return 0;
245: }

247: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
248: {
249:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
250:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

254:   if (a->v) MatSeqDenseCUDACopyToGPU(A);
255:   dA->d_v           = dA->unplacedarray;
256:   dA->user_alloc    = dA->unplaced_user_alloc;
257:   dA->unplacedarray = NULL;
258:   return 0;
259: }

261: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
262: {
263:   Mat_SeqDense     *aa = (Mat_SeqDense *)A->data;
264:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

269:   if (!dA->user_alloc) cudaFree(dA->d_v);
270:   dA->d_v        = (PetscScalar *)a;
271:   dA->user_alloc = PETSC_FALSE;
272:   return 0;
273: }
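/* Minimal usage sketch (not part of the original source) for the three array-management hooks
   implemented above, through their public MatDenseCUDA* interfaces; "my_d_buf" stands for a
   hypothetical user device allocation of at least lda*n scalars:

     PetscCall(MatDenseCUDAPlaceArray(A, my_d_buf));  // A now wraps the user buffer
     // ... use A as usual: MatMult(), factorizations, ...
     PetscCall(MatDenseCUDAResetArray(A));            // restore the stashed original array

   MatDenseCUDAReplaceArray() instead hands ownership of the new buffer to the matrix, which
   will cudaFree() it on destruction (note that user_alloc is set to PETSC_FALSE above). */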

275: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
276: {
277:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

279:   if (!dA->d_v) MatSeqDenseCUDASetPreallocation(A, NULL);
280:   *a = dA->d_v;
281:   return 0;
282: }

284: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
285: {
286:   if (a) *a = NULL;
287:   return 0;
288: }

290: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
291: {
292:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

294:   MatSeqDenseCUDACopyToGPU(A);
295:   *a = dA->d_v;
296:   return 0;
297: }

299: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
300: {
301:   if (a) *a = NULL;
302:   return 0;
303: }

305: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
306: {
307:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

309:   MatSeqDenseCUDACopyToGPU(A);
310:   *a = dA->d_v;
311:   return 0;
312: }

314: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
315: {
316:   if (a) *a = NULL;
317:   return 0;
318: }

320: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
321: {
322: #if PETSC_PKG_CUDA_VERSION_GE(10, 1, 0)
323:   Mat_SeqDense      *a  = (Mat_SeqDense *)A->data;
324:   Mat_SeqDenseCUDA  *dA = (Mat_SeqDenseCUDA *)A->spptr;
325:   PetscScalar       *da;
326:   cusolverDnHandle_t handle;
327:   PetscCuBLASInt     n, lda;
328:   #if defined(PETSC_USE_DEBUG)
329:   PetscCuBLASInt info;
330:   #endif

332:   if (!A->rmap->n || !A->cmap->n) return 0;
333:   PetscCUSOLVERDnGetHandle(&handle);
334:   PetscCuBLASIntCast(A->cmap->n, &n);
335:   PetscCuBLASIntCast(a->lda, &lda);
337:   if (A->factortype == MAT_FACTOR_CHOLESKY) {
338:     if (!dA->d_fact_ipiv) { /* spd */
339:       PetscCuBLASInt il;

341:       MatDenseCUDAGetArray(A, &da);
342:       cusolverDnXpotri_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &il);
343:       if (il > dA->fact_lwork) {
344:         dA->fact_lwork = il;

346:         cudaFree(dA->d_fact_work);
347:         cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
348:       }
349:       PetscLogGpuTimeBegin();
350:       cusolverDnXpotri(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
351:       PetscLogGpuTimeEnd();
352:       MatDenseCUDARestoreArray(A, &da);
353:       /* TODO (write cuda kernel) */
354:       MatSeqDenseSymmetrize_Private(A, PETSC_TRUE);
355:     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytri not implemented");
356:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
357:   #if defined(PETSC_USE_DEBUG)
358:   cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
361:   #endif
362:   PetscLogGpuFlops(1.0 * n * n * n / 3.0);
363:   A->ops->solve          = NULL;
364:   A->ops->solvetranspose = NULL;
365:   A->ops->matsolve       = NULL;
366:   A->factortype          = MAT_FACTOR_NONE;

368:   PetscFree(A->solvertype);
369:   return 0;
370: #else
371:   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
372: #endif
373: }
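/* Illustrative note (not part of the original source): cusolverDnXpotri() overwrites the lower
   triangle of the Cholesky factor with the corresponding triangle of the inverse and leaves the
   strictly upper part untouched, which is why the routine above follows it with
   MatSeqDenseSymmetrize_Private() (currently a host-side operation, hence the TODO). */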

375: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
376: {
377:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
378:   PetscScalar      *y;
379:   PetscCuBLASInt    m = 0, k = 0;
380:   PetscBool         xiscuda, yiscuda, aiscuda;

383:   PetscCuBLASIntCast(A->rmap->n, &m);
384:   PetscCuBLASIntCast(A->cmap->n, &k);
385:   PetscObjectTypeCompare((PetscObject)xx, VECSEQCUDA, &xiscuda);
386:   PetscObjectTypeCompare((PetscObject)yy, VECSEQCUDA, &yiscuda);
387:   {
388:     const PetscScalar *x;
389:     PetscBool          xishost = PETSC_TRUE;

 391:     /* The logic here is to minimize the amount of memory copying:
 392:        if we called VecCUDAGetArrayRead(xx,&x) whenever xiscuda is true but the
 393:        data has not yet been offloaded to the GPU, the data would first be copied to the
 394:        GPU.  But we only need the data in order to copy it into the y array,
 395:        so we take the array x from wherever the data already resides; that way
 396:        only one memcpy is performed */
397:     if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
398:       VecCUDAGetArrayRead(xx, &x);
399:       xishost = PETSC_FALSE;
400:     } else {
401:       VecGetArrayRead(xx, &x);
402:     }
403:     if (k < m || !yiscuda) {
404:       if (!dA->workvec) VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
405:       VecCUDAGetArrayWrite(dA->workvec, &y);
406:     } else {
407:       VecCUDAGetArrayWrite(yy, &y);
408:     }
409:     cudaMemcpy(y, x, m * sizeof(PetscScalar), xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);
410:   }
411:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda);
412:   if (!aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
413:   (*matsolve)(A, y, m, m, 1, k, transpose);
414:   if (!aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
415:   if (k < m || !yiscuda) {
416:     PetscScalar *yv;

418:     /* The logic here is that the data is not yet in either yy's GPU array or its
419:        CPU array.  There is nothing in the interface to say where the user would like
420:        it to end up.  So we choose the GPU, because it is the faster option */
421:     if (yiscuda) {
422:       VecCUDAGetArrayWrite(yy, &yv);
423:     } else {
424:       VecGetArray(yy, &yv);
425:     }
426:     cudaMemcpy(yv, y, k * sizeof(PetscScalar), yiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
427:     if (yiscuda) {
428:       VecCUDARestoreArrayWrite(yy, &yv);
429:     } else {
430:       VecRestoreArray(yy, &yv);
431:     }
432:     VecCUDARestoreArrayWrite(dA->workvec, &y);
433:   } else {
434:     VecCUDARestoreArrayWrite(yy, &y);
435:   }
436:   return 0;
437: }

439: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
440: {
441:   PetscScalar   *y;
442:   PetscInt       n, _ldb, _ldx;
443:   PetscBool      biscuda, xiscuda, aiscuda;
444:   PetscCuBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;

447:   PetscCuBLASIntCast(A->rmap->n, &m);
448:   PetscCuBLASIntCast(A->cmap->n, &k);
449:   MatGetSize(B, NULL, &n);
450:   PetscCuBLASIntCast(n, &nrhs);
451:   MatDenseGetLDA(B, &_ldb);
452:   PetscCuBLASIntCast(_ldb, &ldb);
453:   MatDenseGetLDA(X, &_ldx);
454:   PetscCuBLASIntCast(_ldx, &ldx);

456:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda);
457:   PetscObjectTypeCompare((PetscObject)X, MATSEQDENSECUDA, &xiscuda);
458:   {
 459:     /* The logic here is to minimize the amount of memory copying:
 460:        if we called MatDenseCUDAGetArrayRead(B,&b) whenever biscuda is true but the
 461:        data has not yet been offloaded to the GPU, the data would first be copied to the
 462:        GPU.  But we only need the data in order to copy it into the y array,
 463:        so we take the array b from wherever the data already resides; that way
 464:        only one memcpy is performed */
465:     const PetscScalar *b;

467:     /* some copying from B will be involved */
468:     PetscBool bishost = PETSC_TRUE;

470:     if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
471:       MatDenseCUDAGetArrayRead(B, &b);
472:       bishost = PETSC_FALSE;
473:     } else {
474:       MatDenseGetArrayRead(B, &b);
475:     }
476:     if (ldx < m || !xiscuda) {
 477:       /* X's array cannot be used (too small or not on the device) and B's
 478:        * array cannot be used (it is const), so allocate a new array */
479:       ldy = m;
480:       cudaMalloc((void **)&y, nrhs * m * sizeof(PetscScalar));
481:     } else {
482:       /* X's array should serve as the array */
483:       ldy = ldx;
484:       MatDenseCUDAGetArrayWrite(X, &y);
485:     }
486:     cudaMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);
487:     if (bishost) {
488:       MatDenseRestoreArrayRead(B, &b);
489:     } else {
490:       MatDenseCUDARestoreArrayRead(B, &b);
491:     }
492:   }
493:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda);
494:   if (!aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
495:   (*matsolve)(A, y, ldy, m, nrhs, k, transpose);
 496:   if (!aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
497:   if (ldx < m || !xiscuda) {
498:     PetscScalar *x;

500:     /* The logic here is that the data is not yet in either X's GPU array or its
501:        CPU array.  There is nothing in the interface to say where the user would like
502:        it to end up.  So we choose the GPU, because it is the faster option */
503:     if (xiscuda) {
504:       MatDenseCUDAGetArrayWrite(X, &x);
505:     } else {
506:       MatDenseGetArray(X, &x);
507:     }
508:     cudaMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
509:     if (xiscuda) {
510:       MatDenseCUDARestoreArrayWrite(X, &x);
511:     } else {
512:       MatDenseRestoreArray(X, &x);
513:     }
514:     cudaFree(y);
515:   } else {
516:     MatDenseCUDARestoreArrayWrite(X, &y);
517:   }
518:   return 0;
519: }

521: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
522: {
523:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
524:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
525:   const PetscScalar *da;
526:   PetscCuBLASInt     lda;
527:   cusolverDnHandle_t handle;
528:   int                info;

530:   MatDenseCUDAGetArrayRead(A, &da);
531:   PetscCuBLASIntCast(mat->lda, &lda);
532:   PetscCUSOLVERDnGetHandle(&handle);
533:   PetscLogGpuTimeBegin();
534:   PetscInfo(A, "LU solve %d x %d on backend\n", m, k);
535:   cusolverDnXgetrs(handle, T ? CUBLAS_OP_T : CUBLAS_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_info);
536:   PetscLogGpuTimeEnd();
537:   MatDenseCUDARestoreArrayRead(A, &da);
538:   if (PetscDefined(USE_DEBUG)) {
539:     cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
542:   }
543:   PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
544:   return 0;
545: }

547: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
548: {
549:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
550:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
551:   const PetscScalar *da;
552:   PetscCuBLASInt     lda;
553:   cusolverDnHandle_t handle;
554:   int                info;

556:   MatDenseCUDAGetArrayRead(A, &da);
557:   PetscCuBLASIntCast(mat->lda, &lda);
558:   PetscCUSOLVERDnGetHandle(&handle);
559:   PetscLogGpuTimeBegin();
560:   PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k);
561:   if (!dA->d_fact_ipiv) { /* spd */
 562:     /* note: a cuda-memcheck run reported here "Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery." */
563:     cusolverDnXpotrs(handle, CUBLAS_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_info);
564:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytrs not implemented");
565:   PetscLogGpuTimeEnd();
566:   MatDenseCUDARestoreArrayRead(A, &da);
567:   if (PetscDefined(USE_DEBUG)) {
568:     cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
571:   }
572:   PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
573:   return 0;
574: }

576: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
577: {
578:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
579:   Mat_SeqDenseCUDA  *dA  = (Mat_SeqDenseCUDA *)A->spptr;
580:   const PetscScalar *da;
581:   PetscCuBLASInt     lda, rank;
582:   cusolverDnHandle_t handle;
583:   cublasHandle_t     bhandle;
584:   int                info;
585:   cublasOperation_t  trans;
586:   PetscScalar        one = 1.;

588:   PetscCuBLASIntCast(mat->rank, &rank);
589:   MatDenseCUDAGetArrayRead(A, &da);
590:   PetscCuBLASIntCast(mat->lda, &lda);
591:   PetscCUSOLVERDnGetHandle(&handle);
592:   PetscCUBLASGetHandle(&bhandle);
593:   PetscLogGpuTimeBegin();
594:   PetscInfo(A, "QR solve %d x %d on backend\n", m, k);
595:   if (!T) {
596:     if (PetscDefined(USE_COMPLEX)) {
597:       trans = CUBLAS_OP_C;
598:     } else {
599:       trans = CUBLAS_OP_T;
600:     }
601:     cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
602:     if (PetscDefined(USE_DEBUG)) {
603:       cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
605:     }
606:     cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
607:   } else {
608:     cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
609:     cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
610:     if (PetscDefined(USE_DEBUG)) {
611:       cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
613:     }
614:   }
615:   PetscLogGpuTimeEnd();
616:   MatDenseCUDARestoreArrayRead(A, &da);
 617:   PetscLogGpuFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank)));
618:   return 0;
619: }
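/* Illustrative note (not part of the original source): with the factorization A = Q R produced
   by MatQRFactor_SeqDenseCUDA() below (rank = min(m,n)), the non-transposed branch above computes
     x = R^{-1} (Q^T b)    -- cusolverDnXormqr applies Q^T (Q^H for complex scalars), then
                              cublasXtrsm solves with the upper triangular factor
   i.e. the least-squares solution of A x = b, while the transposed branch computes
     x = Q (R^{-T} b)      -- cublasXtrsm first, then cusolverDnXormqr applies Q
   i.e. the minimum-norm solution of A^T x = b (stated here for real arithmetic). */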

621: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
622: {
623:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
624:   return 0;
625: }

627: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
628: {
629:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
630:   return 0;
631: }

633: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
634: {
635:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
636:   return 0;
637: }

639: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
640: {
641:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
642:   return 0;
643: }

645: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
646: {
647:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
648:   return 0;
649: }

651: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
652: {
653:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
654:   return 0;
655: }

657: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
658: {
659:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
660:   return 0;
661: }

663: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
664: {
665:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
666:   return 0;
667: }

669: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
670: {
671:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
672:   return 0;
673: }

675: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
676: {
677:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
678:   return 0;
679: }

681: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
682: {
683:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
684:   return 0;
685: }

687: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
688: {
689:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
690:   return 0;
691: }

693: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
694: {
695:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
696:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
697:   PetscScalar      *da;
698:   PetscCuBLASInt    m, n, lda;
699: #if defined(PETSC_USE_DEBUG)
700:   int info;
701: #endif
702:   cusolverDnHandle_t handle;

704:   if (!A->rmap->n || !A->cmap->n) return 0;
705:   PetscCUSOLVERDnGetHandle(&handle);
706:   MatDenseCUDAGetArray(A, &da);
707:   PetscCuBLASIntCast(A->cmap->n, &n);
708:   PetscCuBLASIntCast(A->rmap->n, &m);
709:   PetscCuBLASIntCast(a->lda, &lda);
710:   PetscInfo(A, "LU factor %d x %d on backend\n", m, n);
711:   if (!dA->d_fact_ipiv) cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
712:   if (!dA->fact_lwork) {
713:     cusolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
714:     cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
715:   }
716:   if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
717:   PetscLogGpuTimeBegin();
718:   cusolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->d_fact_ipiv, dA->d_fact_info);
719:   PetscLogGpuTimeEnd();
720:   MatDenseCUDARestoreArray(A, &da);
721: #if defined(PETSC_USE_DEBUG)
722:   cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
725: #endif
726:   A->factortype = MAT_FACTOR_LU;
727:   PetscLogGpuFlops(2.0 * n * n * m / 3.0);

729:   A->ops->solve             = MatSolve_SeqDenseCUDA_LU;
730:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_LU;
731:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_LU;
732:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;

734:   PetscFree(A->solvertype);
735:   PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
736:   return 0;
737: }

739: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A, IS perm, const MatFactorInfo *factinfo)
740: {
741:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
742:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
743:   PetscScalar      *da;
744:   PetscCuBLASInt    n, lda;
745: #if defined(PETSC_USE_DEBUG)
746:   int info;
747: #endif
748:   cusolverDnHandle_t handle;

750:   if (!A->rmap->n || !A->cmap->n) return 0;
751:   PetscCUSOLVERDnGetHandle(&handle);
752:   PetscCuBLASIntCast(A->rmap->n, &n);
753:   PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n);
754:   if (A->spd == PETSC_BOOL3_TRUE) {
755:     MatDenseCUDAGetArray(A, &da);
756:     PetscCuBLASIntCast(a->lda, &lda);
757:     if (!dA->fact_lwork) {
758:       cusolverDnXpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork);
759:       cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
760:     }
761:     if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
762:     PetscLogGpuTimeBegin();
763:     cusolverDnXpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
764:     PetscLogGpuTimeEnd();

766:     MatDenseCUDARestoreArray(A, &da);
767: #if defined(PETSC_USE_DEBUG)
768:     cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
771: #endif
772:     A->factortype = MAT_FACTOR_CHOLESKY;
773:     PetscLogGpuFlops(1.0 * n * n * n / 3.0);
774:   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
775: #if 0
 776:     /* at the time this interface was written (CUDA 10.0), cusolverDn did not implement the *sytrs and *hetr* routines.
 777:        The code below should work and can be activated once the *sytrs routines become available */
778:     if (!dA->d_fact_ipiv) {
779:       cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));
780:     }
781:     if (!dA->fact_lwork) {
782:       cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);
783:       cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
784:     }
785:     if (!dA->d_fact_info) {
786:       cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
787:     }
788:     PetscLogGpuTimeBegin();
789:     cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
790:     PetscLogGpuTimeEnd();
791: #endif

793:   A->ops->solve             = MatSolve_SeqDenseCUDA_Cholesky;
794:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_Cholesky;
795:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_Cholesky;
796:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
797:   PetscFree(A->solvertype);
798:   PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
799:   return 0;
800: }

802: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A, IS col, const MatFactorInfo *factinfo)
803: {
804:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
805:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
806:   PetscScalar      *da;
807:   PetscCuBLASInt    m, min, max, n, lda;
808: #if defined(PETSC_USE_DEBUG)
809:   int info;
810: #endif
811:   cusolverDnHandle_t handle;

813:   if (!A->rmap->n || !A->cmap->n) return 0;
814:   PetscCUSOLVERDnGetHandle(&handle);
815:   MatDenseCUDAGetArray(A, &da);
816:   PetscCuBLASIntCast(A->cmap->n, &n);
817:   PetscCuBLASIntCast(A->rmap->n, &m);
818:   PetscCuBLASIntCast(a->lda, &lda);
819:   PetscInfo(A, "QR factor %d x %d on backend\n", m, n);
820:   max = PetscMax(m, n);
821:   min = PetscMin(m, n);
822:   if (!dA->d_fact_tau) cudaMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau));
823:   if (!dA->d_fact_ipiv) cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
824:   if (!dA->fact_lwork) {
825:     cusolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
826:     cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
827:   }
828:   if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
829:   if (!dA->workvec) VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
830:   PetscLogGpuTimeBegin();
831:   cusolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
832:   PetscLogGpuTimeEnd();
833:   MatDenseCUDARestoreArray(A, &da);
834: #if defined(PETSC_USE_DEBUG)
835:   cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
837: #endif
838:   A->factortype = MAT_FACTOR_QR;
839:   a->rank       = min;
840:   PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0));

842:   A->ops->solve             = MatSolve_SeqDenseCUDA_QR;
843:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_QR;
844:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_QR;
845:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;

847:   PetscFree(A->solvertype);
848:   PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
849:   return 0;
850: }

852: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
853: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
854: {
855:   const PetscScalar *da, *db;
856:   PetscScalar       *dc;
857:   PetscScalar        one = 1.0, zero = 0.0;
858:   PetscCuBLASInt     m, n, k;
859:   PetscInt           alda, blda, clda;
860:   cublasHandle_t     cublasv2handle;
861:   PetscBool          Aiscuda, Biscuda;
862:   cublasStatus_t     berr;

864:   /* we may end up with SEQDENSE as one of the arguments */
865:   PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &Aiscuda);
866:   PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &Biscuda);
867:   if (!Aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
868:   if (!Biscuda) MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
869:   PetscCuBLASIntCast(C->rmap->n, &m);
870:   PetscCuBLASIntCast(C->cmap->n, &n);
871:   if (tA) PetscCuBLASIntCast(A->rmap->n, &k);
872:   else PetscCuBLASIntCast(A->cmap->n, &k);
873:   if (!m || !n || !k) return 0;
874:   PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n);
875:   MatDenseCUDAGetArrayRead(A, &da);
876:   MatDenseCUDAGetArrayRead(B, &db);
877:   MatDenseCUDAGetArrayWrite(C, &dc);
878:   MatDenseGetLDA(A, &alda);
879:   MatDenseGetLDA(B, &blda);
880:   MatDenseGetLDA(C, &clda);
881:   PetscCUBLASGetHandle(&cublasv2handle);
882:   PetscLogGpuTimeBegin();
883:   berr = cublasXgemm(cublasv2handle, tA ? CUBLAS_OP_T : CUBLAS_OP_N, tB ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda);
 884:   (void)berr;
885:   PetscLogGpuTimeEnd();
886:   PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1));
887:   MatDenseCUDARestoreArrayRead(A, &da);
888:   MatDenseCUDARestoreArrayRead(B, &db);
889:   MatDenseCUDARestoreArrayWrite(C, &dc);
890:   if (!Aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
891:   if (!Biscuda) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
892:   return 0;
893: }

895: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
896: {
897:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_TRUE, PETSC_FALSE);
898:   return 0;
899: }

901: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
902: {
903:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_FALSE);
904:   return 0;
905: }

907: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
908: {
909:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_TRUE);
910:   return 0;
911: }

913: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
914: {
915:   MatProductSetFromOptions_SeqDense(C);
916:   return 0;
917: }

 919: /* zz = op(A)*xx + yy
 920:    if yy == NULL, this reduces to a plain MatMult */
921: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
922: {
923:   Mat_SeqDense      *mat = (Mat_SeqDense *)A->data;
924:   const PetscScalar *xarray, *da;
925:   PetscScalar       *zarray;
926:   PetscScalar        one = 1.0, zero = 0.0;
927:   PetscCuBLASInt     m, n, lda;
928:   cublasHandle_t     cublasv2handle;
929:   cublasStatus_t     berr;

931:   /* mult add */
932:   if (yy && yy != zz) VecCopy_SeqCUDA(yy, zz);
933:   if (!A->rmap->n || !A->cmap->n) {
934:     /* mult only */
935:     if (!yy) VecSet_SeqCUDA(zz, 0.0);
936:     return 0;
937:   }
938:   PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n);
939:   PetscCuBLASIntCast(A->rmap->n, &m);
940:   PetscCuBLASIntCast(A->cmap->n, &n);
941:   PetscCUBLASGetHandle(&cublasv2handle);
942:   MatDenseCUDAGetArrayRead(A, &da);
943:   PetscCuBLASIntCast(mat->lda, &lda);
944:   VecCUDAGetArrayRead(xx, &xarray);
945:   VecCUDAGetArray(zz, &zarray);
946:   PetscLogGpuTimeBegin();
947:   berr = cublasXgemv(cublasv2handle, trans ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1);
 948:   (void)berr;
949:   PetscLogGpuTimeEnd();
950:   PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n));
951:   VecCUDARestoreArrayRead(xx, &xarray);
952:   VecCUDARestoreArray(zz, &zarray);
953:   MatDenseCUDARestoreArrayRead(A, &da);
954:   return 0;
955: }

957: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
958: {
959:   MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_FALSE);
960:   return 0;
961: }

963: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
964: {
965:   MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_TRUE);
966:   return 0;
967: }

969: PetscErrorCode MatMult_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
970: {
971:   MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_FALSE);
972:   return 0;
973: }

975: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
976: {
977:   MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_TRUE);
978:   return 0;
979: }

981: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **array)
982: {
983:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

985:   MatSeqDenseCUDACopyFromGPU(A);
986:   *array = mat->v;
987:   return 0;
988: }

990: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **array)
991: {
992:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

994:   /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
995:   if (!mat->v) MatSeqDenseSetPreallocation(A, NULL);
996:   *array         = mat->v;
997:   A->offloadmask = PETSC_OFFLOAD_CPU;
998:   return 0;
999: }

1001: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A, PetscScalar **array)
1002: {
1003:   Mat_SeqDense *mat = (Mat_SeqDense *)A->data;

1005:   MatSeqDenseCUDACopyFromGPU(A);
1006:   *array         = mat->v;
1007:   A->offloadmask = PETSC_OFFLOAD_CPU;
1008:   return 0;
1009: }

1011: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y, PetscScalar alpha)
1012: {
1013:   Mat_SeqDense  *y = (Mat_SeqDense *)Y->data;
1014:   PetscScalar   *dy;
1015:   PetscCuBLASInt j, N, m, lday, one = 1;
1016:   cublasHandle_t cublasv2handle;

1018:   PetscCUBLASGetHandle(&cublasv2handle);
1019:   MatDenseCUDAGetArray(Y, &dy);
1020:   PetscCuBLASIntCast(Y->rmap->n * Y->cmap->n, &N);
1021:   PetscCuBLASIntCast(Y->rmap->n, &m);
1022:   PetscCuBLASIntCast(y->lda, &lday);
1023:   PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
1024:   PetscLogGpuTimeBegin();
1025:   if (lday > m) {
1026:     for (j = 0; j < Y->cmap->n; j++) cublasXscal(cublasv2handle, m, &alpha, dy + lday * j, one);
1027:   } else cublasXscal(cublasv2handle, N, &alpha, dy, one);
1028:   PetscLogGpuTimeEnd();
1029:   PetscLogGpuFlops(N);
1030:   MatDenseCUDARestoreArray(Y, &dy);
1031:   return 0;
1032: }

1034: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1035:   const PetscScalar shift_;
1036:   petscshift(PetscScalar shift) : shift_(shift) { }
1037:   __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1038: };

1040: template <typename Iterator>
1041: class strided_range {
1042: public:
1043:   typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1044:   struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1045:     difference_type stride;
1046:     stride_functor(difference_type stride) : stride(stride) { }
1047:     __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1048:   };
1049:   typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
1050:   typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1051:   typedef typename thrust::permutation_iterator<Iterator, TransformIterator>    PermutationIterator;
1052:   typedef PermutationIterator                                                   iterator; // type of the strided_range iterator
1053:   // construct strided_range for the range [first,last)
1054:   strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1055:   iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1056:   iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }

1058: protected:
1059:   Iterator        first;
1060:   Iterator        last;
1061:   difference_type stride;
1062: };
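/* Illustrative note (not part of the original source): in column-major storage with leading
   dimension lda, entry (i,i) sits at index i + i*lda = i*(lda+1), so consecutive diagonal
   entries are lda+1 elements apart.  MatShift_DenseCUDA_Private() below therefore walks a
   strided_range with stride lda+1 and applies petscshift, adding alpha to every diagonal
   entry of the (local block of the) matrix. */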

1064: PetscErrorCode MatShift_DenseCUDA_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1065: {
1066:   PetscInt rend2 = PetscMin(rend, cols);
1067:   if (rend2 > rstart) {
1068:     PetscLogGpuTimeBegin();
1069:     try {
1070:       const auto                                                  dptr  = thrust::device_pointer_cast(da);
1071:       size_t                                                      begin = rstart * lda;
1072:       size_t                                                      end   = rend2 - rstart + rend2 * lda;
1073:       strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1074:       thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1075:     } catch (char *ex) {
1076:       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1077:     }
1078:     PetscLogGpuTimeEnd();
1079:     PetscLogGpuFlops(rend2 - rstart);
1080:   }
1081:   return 0;
1082: }

1084: PetscErrorCode MatShift_SeqDenseCUDA(Mat A, PetscScalar alpha)
1085: {
1086:   PetscScalar *da;
1087:   PetscInt     m = A->rmap->n, n = A->cmap->n, lda;

1089:   MatDenseCUDAGetArray(A, &da);
1090:   MatDenseGetLDA(A, &lda);
1091:   PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n);
1092:   MatShift_DenseCUDA_Private(da, alpha, lda, 0, m, n);
1093:   MatDenseCUDARestoreArray(A, &da);
1094:   return 0;
1095: }

1097: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1098: {
1099:   Mat_SeqDense      *x = (Mat_SeqDense *)X->data;
1100:   Mat_SeqDense      *y = (Mat_SeqDense *)Y->data;
1101:   const PetscScalar *dx;
1102:   PetscScalar       *dy;
1103:   PetscCuBLASInt     j, N, m, ldax, lday, one = 1;
1104:   cublasHandle_t     cublasv2handle;

1106:   if (!X->rmap->n || !X->cmap->n) return 0;
1107:   PetscCUBLASGetHandle(&cublasv2handle);
1108:   MatDenseCUDAGetArrayRead(X, &dx);
1109:   if (alpha == 0.0) MatDenseCUDAGetArrayWrite(Y, &dy);
1110:   else MatDenseCUDAGetArray(Y, &dy);
1111:   PetscCuBLASIntCast(X->rmap->n * X->cmap->n, &N);
1112:   PetscCuBLASIntCast(X->rmap->n, &m);
1113:   PetscCuBLASIntCast(x->lda, &ldax);
1114:   PetscCuBLASIntCast(y->lda, &lday);
1115:   PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
1116:   PetscLogGpuTimeBegin();
1117:   if (ldax > m || lday > m) {
1118:     for (j = 0; j < X->cmap->n; j++) cublasXaxpy(cublasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one);
1119:   } else cublasXaxpy(cublasv2handle, N, &alpha, dx, one, dy, one);
1120:   PetscLogGpuTimeEnd();
1121:   PetscLogGpuFlops(PetscMax(2. * N - 1, 0));
1122:   MatDenseCUDARestoreArrayRead(X, &dx);
1123:   if (alpha == 0.0) MatDenseCUDARestoreArrayWrite(Y, &dy);
1124:   else MatDenseCUDARestoreArray(Y, &dy);
1125:   return 0;
1126: }

1128: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1129: {
1130:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

1132:   if (dA) {
1134:     if (!dA->user_alloc) cudaFree(dA->d_v);
1135:     cudaFree(dA->d_fact_tau);
1136:     cudaFree(dA->d_fact_ipiv);
1137:     cudaFree(dA->d_fact_info);
1138:     cudaFree(dA->d_fact_work);
1139:     VecDestroy(&dA->workvec);
1140:   }
1141:   PetscFree(A->spptr);
1142:   return 0;
1143: }

1145: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1146: {
1147:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1149:   /* avoid copying the data back from the GPU if we own the data pointer */
1150:   if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1151:   MatConvert_SeqDenseCUDA_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
1152:   MatDestroy_SeqDense(A);
1153:   return 0;
1154: }

1156: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
1157: {
1158:   MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

1160:   MatCreate(PetscObjectComm((PetscObject)A), B);
1161:   MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
1162:   MatSetType(*B, ((PetscObject)A)->type_name);
1163:   MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues);
1164:   if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) MatCopy_SeqDenseCUDA(A, *B, SAME_NONZERO_PATTERN);
1165:   if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1166:     Mat_SeqDenseCUDA *dB = (Mat_SeqDenseCUDA *)(*B)->spptr;
1167:     if (!dB->d_v) MatSeqDenseCUDASetPreallocation(*B, NULL);
1168:   }
1169:   return 0;
1170: }

1172: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A, Vec v, PetscInt col)
1173: {
1174:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
1175:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1176:   PetscScalar      *x;
1177:   PetscBool         viscuda;

1179:   PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, VECCUDA, "");
1180:   if (viscuda && !v->boundtocpu) { /* update device data */
1181:     VecCUDAGetArrayWrite(v, &x);
1182:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1183:       cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
1184:     } else {
1185:       cudaMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
1186:     }
1187:     VecCUDARestoreArrayWrite(v, &x);
1188:   } else { /* update host data */
1189:     VecGetArrayWrite(v, &x);
1190:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1191:       PetscArraycpy(x, a->v + col * a->lda, A->rmap->n);
1192:     } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1193:       cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1194:     }
1195:     VecRestoreArrayWrite(v, &x);
1196:   }
1197:   return 0;
1198: }

1200: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A, MatFactorType ftype, Mat *fact)
1201: {
1202:   MatCreate(PetscObjectComm((PetscObject)A), fact);
1203:   MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
1204:   MatSetType(*fact, MATSEQDENSECUDA);
1205:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1206:     (*fact)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1207:     (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1208:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1209:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1210:   } else if (ftype == MAT_FACTOR_QR) {
1211:     PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense);
1212:     PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense);
1213:   }
1214:   (*fact)->factortype = ftype;
1215:   PetscFree((*fact)->solvertype);
1216:   PetscStrallocpy(MATSOLVERCUDA, &(*fact)->solvertype);
1217:   PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1218:   PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1219:   PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1220:   PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1221:   return 0;
1222: }
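
/*
   A minimal usage sketch (hypothetical names A, b, x; error handling abbreviated): the
   factory above is selected when a dense matrix is combined with the MATSOLVERCUDA
   solver type, for example through a direct-solve KSP/PC:

     KSP ksp;
     PC  pc;

     PetscCall(KSPCreate(PETSC_COMM_SELF, &ksp));
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPSetType(ksp, KSPPREONLY));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCLU));                          // or PCCHOLESKY, matching the factor types registered above
     PetscCall(PCFactorSetMatSolverType(pc, MATSOLVERCUDA));  // factor and solve with cuSOLVER
     PetscCall(KSPSolve(ksp, b, x));
     PetscCall(KSPDestroy(&ksp));
*/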

1224: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1225: {
1226:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1230:   MatDenseCUDAGetArray(A, (PetscScalar **)&a->ptrinuse);
1231:   if (!a->cvec) { /* we pass A's data pointer so the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1232:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1233:   }
1234:   a->vecinuse = col + 1;
1235:   VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1236:   *v = a->cvec;
1237:   return 0;
1238: }

1240: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1241: {
1242:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1246:   a->vecinuse = 0;
1247:   VecCUDAResetArray(a->cvec);
1248:   MatDenseCUDARestoreArray(A, (PetscScalar **)&a->ptrinuse);
1249:   if (v) *v = NULL;
1250:   return 0;
1251: }

1253: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1254: {
1255:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1259:   MatDenseCUDAGetArrayRead(A, &a->ptrinuse);
1260:   if (!a->cvec) { /* we pass A's data pointer so the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1261:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1262:   }
1263:   a->vecinuse = col + 1;
1264:   VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1265:   VecLockReadPush(a->cvec);
1266:   *v = a->cvec;
1267:   return 0;
1268: }

1270: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1271: {
1272:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1276:   a->vecinuse = 0;
1277:   VecLockReadPop(a->cvec);
1278:   VecCUDAResetArray(a->cvec);
1279:   MatDenseCUDARestoreArrayRead(A, &a->ptrinuse);
1280:   if (v) *v = NULL;
1281:   return 0;
1282: }

1284: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1285: {
1286:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1290:   MatDenseCUDAGetArrayWrite(A, (PetscScalar **)&a->ptrinuse);
1291:   if (!a->cvec) { /* we pass A's data pointer so the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1292:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1293:   }
1294:   a->vecinuse = col + 1;
1295:   VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1296:   *v = a->cvec;
1297:   return 0;
1298: }

1300: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1301: {
1302:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1306:   a->vecinuse = 0;
1307:   VecCUDAResetArray(a->cvec);
1308:   MatDenseCUDARestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse);
1309:   if (v) *v = NULL;
1310:   return 0;
1311: }
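
/*
   A minimal usage sketch of the column-vector accessors above (hypothetical names A and
   col): the returned Vec is a view of column col in the matrix storage, so it must be
   restored before another column is requested.

     Vec       v;
     PetscReal nrm;

     PetscCall(MatDenseGetColumnVecRead(A, col, &v)); // v wraps column col; for a GPU matrix it is a VECSEQCUDA view
     PetscCall(VecNorm(v, NORM_2, &nrm));
     PetscCall(MatDenseRestoreColumnVecRead(A, col, &v));
*/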

1313: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1314: {
1315:   Mat_SeqDense     *a  = (Mat_SeqDense *)A->data;
1316:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;

1320:   if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) MatDestroy(&a->cmat);
1321:   MatSeqDenseCUDACopyToGPU(A);
1322:   if (!a->cmat) {
1323:     MatCreateDenseCUDA(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat);
1324:   } else {
1325:     MatDenseCUDAPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda);
1326:   }
1327:   MatDenseSetLDA(a->cmat, a->lda);
1328:   /* Place the CPU array if present, but do not copy any data */
1329:   a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1330:   if (a->v) MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda);
1331:   a->cmat->offloadmask = A->offloadmask;
1332:   a->matinuse          = cbegin + 1;
1333:   *v                   = a->cmat;
1334:   return 0;
1335: }

1337: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A, Mat *v)
1338: {
1339:   Mat_SeqDense    *a    = (Mat_SeqDense *)A->data;
1340:   PetscBool        copy = PETSC_FALSE, reset;
1341:   PetscOffloadMask suboff;

1346:   a->matinuse = 0;
1347:   reset       = a->v ? PETSC_TRUE : PETSC_FALSE;
1348:   suboff      = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
1349:   if (suboff == PETSC_OFFLOAD_CPU && !a->v) {
1350:     copy = PETSC_TRUE;
1351:     MatSeqDenseSetPreallocation(A, NULL);
1352:   }
1353:   MatDenseCUDAResetArray(a->cmat);
1354:   if (reset) MatDenseResetArray(a->cmat);
1355:   if (copy) {
1356:     MatSeqDenseCUDACopyFromGPU(A);
1357:   } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1358:   a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1359:   if (v) *v = NULL;
1360:   return 0;
1361: }
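
/*
   A minimal usage sketch of the sub-matrix accessors above (hypothetical names): the
   returned Mat is a view of A's storage for rows [rbegin,rend) and columns [cbegin,cend),
   sharing the leading dimension of A; only one such view can be outstanding at a time.

     Mat sub;

     PetscCall(MatDenseGetSubMatrix(A, rbegin, rend, cbegin, cend, &sub));
     PetscCall(MatScale(sub, 2.0)); // operates directly on the viewed block of A
     PetscCall(MatDenseRestoreSubMatrix(A, &sub));
*/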

1363: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A, PetscInt lda)
1364: {
1365:   Mat_SeqDense     *cA = (Mat_SeqDense *)A->data;
1366:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1367:   PetscBool         data;

1369:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1372:   cA->lda = lda;
1373:   return 0;
1374: }

1376: static PetscErrorCode MatSetUp_SeqDenseCUDA(Mat A)
1377: {
1378:   PetscLayoutSetUp(A->rmap);
1379:   PetscLayoutSetUp(A->cmap);
1380:   if (!A->preallocated) MatSeqDenseCUDASetPreallocation(A, NULL);
1381:   return 0;
1382: }

1384: static PetscErrorCode MatSetRandom_SeqDenseCUDA(Mat A, PetscRandom r)
1385: {
1386:   PetscBool iscurand;

1388:   PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand);
1389:   if (iscurand) {
1390:     PetscScalar *a;
1391:     PetscInt     lda, m, n, mn = 0;

1393:     MatGetSize(A, &m, &n);
1394:     MatDenseGetLDA(A, &lda);
1395:     MatDenseCUDAGetArrayWrite(A, &a);
1396:     if (lda > m) {
1397:       for (PetscInt i = 0; i < n; i++) PetscRandomGetValues(r, m, a + i * lda);
1398:     } else {
1399:       PetscIntMultError(m, n, &mn);
1400:       PetscRandomGetValues(r, mn, a);
1401:     }
1402:     MatDenseCUDARestoreArrayWrite(A, &a);
1403:   } else MatSetRandom_SeqDense(A, r);
1404:   return 0;
1405: }
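
/*
   A minimal usage sketch (hypothetical name A): with a CURAND random context the entries
   are generated directly in GPU memory, otherwise the routine falls back to the SeqDense
   host path.

     PetscRandom r;

     PetscCall(PetscRandomCreate(PETSC_COMM_SELF, &r));
     PetscCall(PetscRandomSetType(r, PETSCCURAND));
     PetscCall(MatSetRandom(A, r));
     PetscCall(PetscRandomDestroy(&r));
*/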

1407: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A, PetscBool flg)
1408: {
1409:   Mat_SeqDense *a = (Mat_SeqDense *)A->data;

1413:   A->boundtocpu = flg;
1414:   if (!flg) {
1415:     PetscBool iscuda;

1417:     PetscFree(A->defaultrandtype);
1418:     PetscStrallocpy(PETSCCURAND, &A->defaultrandtype);
1419:     PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQCUDA, &iscuda);
1420:     if (!iscuda) VecDestroy(&a->cvec);
1421:     PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSECUDA, &iscuda);
1422:     if (!iscuda) MatDestroy(&a->cmat);
1423:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA);
1424:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA);
1425:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseCUDA);
1426:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseCUDA);
1427:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseCUDA);
1428:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseCUDA);
1429:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1430:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseCUDA);
1431:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1432:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseCUDA);
1433:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseCUDA);
1434:     PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseCUDA);
1435:     PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseCUDA);

1437:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1438:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1439:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1440:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1441:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1442:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1443:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1444:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1445:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1446:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1447:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1448:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1449:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1450:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1451:     A->ops->shift                   = MatShift_SeqDenseCUDA;
1452:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1453:     A->ops->zeroentries             = MatZeroEntries_SeqDenseCUDA;
1454:     A->ops->setup                   = MatSetUp_SeqDenseCUDA;
1455:     A->ops->setrandom               = MatSetRandom_SeqDenseCUDA;
1456:   } else {
1457:     /* make sure we have an up-to-date copy on the CPU */
1458:     MatSeqDenseCUDACopyFromGPU(A);
1459:     PetscFree(A->defaultrandtype);
1460:     PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype);
1461:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense);
1462:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
1463:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense);
1464:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense);
1465:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense);
1466:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense);
1467:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense);
1468:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense);
1469:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense);
1470:     PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense);
1471:     PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense);
1472:     PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense);
1473:     PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense);

1475:     A->ops->duplicate               = MatDuplicate_SeqDense;
1476:     A->ops->mult                    = MatMult_SeqDense;
1477:     A->ops->multadd                 = MatMultAdd_SeqDense;
1478:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1479:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1480:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1481:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1482:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1483:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1484:     A->ops->axpy                    = MatAXPY_SeqDense;
1485:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1486:     A->ops->lufactor                = MatLUFactor_SeqDense;
1488:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1489:     A->ops->scale                   = MatScale_SeqDense;
1490:     A->ops->shift                   = MatShift_SeqDense;
1491:     A->ops->copy                    = MatCopy_SeqDense;
1492:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1493:     A->ops->setup                   = MatSetUp_SeqDense;
1494:     A->ops->setrandom               = MatSetRandom_SeqDense;
1495:   }
1496:   if (a->cmat) MatBindToCPU(a->cmat, flg);
1497:   return 0;
1498: }
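
/*
   A minimal usage sketch (hypothetical name A): binding to the CPU swaps in the plain
   SeqDense kernels after making sure the host copy is up to date; unbinding restores the
   CUDA implementations installed above.

     PetscCall(MatBindToCPU(A, PETSC_TRUE));  // subsequent MatMult(), factorizations, ... run on the CPU
     PetscCall(MatBindToCPU(A, PETSC_FALSE)); // back to the GPU implementations
*/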

1500: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1501: {
1502:   Mat           B;
1503:   Mat_SeqDense *a;

1505:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1506:     /* TODO these cases should be optimized */
1507:     MatConvert_Basic(M, type, reuse, newmat);
1508:     return 0;
1509:   }

1511:   B = *newmat;
1512:   MatBindToCPU_SeqDenseCUDA(B, PETSC_TRUE);
1513:   MatReset_SeqDenseCUDA(B);
1514:   PetscFree(B->defaultvectype);
1515:   PetscStrallocpy(VECSTANDARD, &B->defaultvectype);
1516:   PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE);
1517:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", NULL);
1518:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", NULL);
1519:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", NULL);
1520:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", NULL);
1521:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", NULL);
1522:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", NULL);
1523:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", NULL);
1524:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", NULL);
1525:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", NULL);
1526:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", NULL);
1527:   PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", NULL);
1528:   a = (Mat_SeqDense *)B->data;
1529:   VecDestroy(&a->cvec); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1530:   B->ops->bindtocpu = NULL;
1531:   B->ops->destroy   = MatDestroy_SeqDense;
1532:   B->offloadmask    = PETSC_OFFLOAD_CPU;
1533:   return 0;
1534: }

1536: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1537: {
1538:   Mat_SeqDenseCUDA *dB;
1539:   Mat               B;
1540:   Mat_SeqDense     *a;

1542:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1543:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1544:     /* TODO these cases should be optimized */
1545:     MatConvert_Basic(M, type, reuse, newmat);
1546:     return 0;
1547:   }

1549:   B = *newmat;
1550:   PetscFree(B->defaultvectype);
1551:   PetscStrallocpy(VECCUDA, &B->defaultvectype);
1552:   PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSECUDA);
1553:   PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1554:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1555:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1556:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1557:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1558:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1559:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1560:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1561:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1562:   PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1563:   PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", MatProductSetFromOptions_SeqAIJ_SeqDense);
1564:   a = (Mat_SeqDense *)B->data;
1565:   VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1566:   PetscNew(&dB);

1568:   B->spptr       = dB;
1569:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1571:   MatBindToCPU_SeqDenseCUDA(B, PETSC_FALSE);
1572:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1573:   B->ops->destroy   = MatDestroy_SeqDenseCUDA;
1574:   return 0;
1575: }
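
/*
   A minimal usage sketch (hypothetical name A): an existing MATSEQDENSE matrix can be
   promoted in place, which runs the conversion routine above.

     PetscCall(MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A));
*/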

1577: /*@C
1578:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1580:    Collective

1582:    Input Parameters:
1583: +  comm - MPI communicator
1584: .  m - number of rows
1585: .  n - number of columns
1586: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1587:    to control matrix memory allocation.

1589:    Output Parameter:
1590: .  A - the matrix

1592:    Level: intermediate

1594: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1595: @*/
1596: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1597: {
1598:   PetscMPIInt size;

1600:   MPI_Comm_size(comm, &size);
1602:   MatCreate(comm, A);
1603:   MatSetSizes(*A, m, n, m, n);
1604:   MatSetType(*A, MATSEQDENSECUDA);
1605:   MatSeqDenseCUDASetPreallocation(*A, data);
1606:   return 0;
1607: }
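
/*
   A minimal usage sketch (assuming a CUDA-enabled build; m and n are placeholders):
   passing data = NULL lets PETSc allocate the GPU storage, which can then be filled
   through the device array accessors.

     Mat          A;
     PetscScalar *a;
     PetscInt     lda;

     PetscCall(MatCreateSeqDenseCUDA(PETSC_COMM_SELF, m, n, NULL, &A));
     PetscCall(MatDenseGetLDA(A, &lda));
     PetscCall(MatDenseCUDAGetArrayWrite(A, &a)); // device pointer, column-major with leading dimension lda
     // ... fill with a kernel launch or cudaMemcpy() ...
     PetscCall(MatDenseCUDARestoreArrayWrite(A, &a));
*/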

1609: /*MC
1610:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1612:    Options Database Keys:
1613: . -mat_type seqdensecuda - sets the matrix type to `MATSEQDENSECUDA` during a call to MatSetFromOptions()

1615:   Level: beginner

1617: .seealso: `MATSEQDENSE`
1618: M*/
1619: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1620: {
1621:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1622:   MatCreate_SeqDense(B);
1623:   MatConvert_SeqDense_SeqDenseCUDA(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
1624:   return 0;
1625: }
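
/*
   A minimal usage sketch of options-driven selection (hypothetical names; m and n are
   placeholders): running with -mat_type seqdensecuda makes MatSetFromOptions() invoke
   the constructor above.

     Mat A;

     PetscCall(MatCreate(PETSC_COMM_SELF, &A));
     PetscCall(MatSetSizes(A, m, n, m, n));
     PetscCall(MatSetType(A, MATDENSE));
     PetscCall(MatSetFromOptions(A));
     PetscCall(MatSetUp(A));
*/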