Actual source code: densecuda.cu

  1: /*
  2:      Defines the matrix operations for sequential dense matrices with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsccublas.h>

  9: /* cublas definitions are here */
 10: #include <petsc/private/cudavecimpl.h>

 12: #if defined(PETSC_USE_COMPLEX)
 13: #if defined(PETSC_USE_REAL_SINGLE)
 14: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 15: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 16: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
 17: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 18: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 19: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
 20: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
 21: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
 22: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 23: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
 24: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnCgeqrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
 25: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnCgeqrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(cuComplex*)(g),(h),(i))
 26: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnCunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(l))
 27: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnCunmqr((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(cuComplex*)(l),(m),(n))
 28: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasCtrsm((a),(b),(c),(d),(e),(f),(g),(cuComplex*)(h),(cuComplex*)(i),(j),(cuComplex*)(k),(l))
 29: #else /* complex double */
 30: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 31: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 32: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
 33: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 34: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 35: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
 36: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
 37: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
 38: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 39: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
 40: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnZgeqrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
 41: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnZgeqrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(cuDoubleComplex*)(g),(h),(i))
 42: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnZunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(l))
 43: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnZunmqr((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(m),(n))
 44: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasZtrsm((a),(b),(c),(d),(e),(f),(g),(cuDoubleComplex*)(h),(cuDoubleComplex*)(i),(j),(cuDoubleComplex*)(k),(l))
 45: #endif
 46: #else /* real single */
 47: #if defined(PETSC_USE_REAL_SINGLE)
 48: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 49: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 50: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 51: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
 52: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
 53: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 54: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
 55: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 56: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 57: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 58: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnSgeqrf_bufferSize((a),(b),(c),(float*)(d),(e),(f))
 59: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnSgeqrf((a),(b),(c),(float*)(d),(e),(float*)(f),(float*)(g),(h),(i))
 60: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnSormqr_bufferSize((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(l))
 61: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnSormqr((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(float*)(l),(m),(n))
 62: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasStrsm((a),(b),(c),(d),(e),(f),(g),(float*)(h),(float*)(i),(j),(float*)(k),(l))
 63: #else /* real double */
 64: #define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
 65: #define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
 66: #define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
 67: #define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
 68: #define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
 69: #define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
 70: #define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
 71: #define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
 72: #define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
 73: #define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
 74: #define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnDgeqrf_bufferSize((a),(b),(c),(double*)(d),(e),(f))
 75: #define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnDgeqrf((a),(b),(c),(double*)(d),(e),(double*)(f),(double*)(g),(h),(i))
 76: #define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnDormqr_bufferSize((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(l))
 77: #define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnDormqr((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(double*)(l),(m),(n))
 78: #define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasDtrsm((a),(b),(c),(d),(e),(f),(g),(double*)(h),(double*)(i),(j),(double*)(k),(l))
 79: #endif
 80: #endif
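/* Added commentary (not part of the original source): the macros above dispatch the
   type-generic cusolverDnX... and cublasX... names to the precision-specific routines selected
   at configure time.  For example, in a real double-precision build

     cusolverDnXgetrf(handle,m,n,da,lda,work,ipiv,info);

   expands to cusolverDnDgetrf(...), while a complex single-precision build expands the same
   call to cusolverDnCgetrf(...) with the PetscScalar pointers cast to cuComplex*. */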

 82: typedef struct {
 83:   PetscScalar *d_v; /* pointer to the matrix on the GPU */
 84:   PetscBool   user_alloc;
 85:   PetscScalar *unplacedarray; /* if one called MatDenseCUDAPlaceArray(), this is where it stashed the original */
 86:   PetscBool   unplaced_user_alloc;
 87:   /* factorization support */
 88:   PetscCuBLASInt *d_fact_ipiv; /* device pivots */
 89:   PetscScalar *d_fact_tau;  /* device QR tau vector */
 90:   PetscScalar *d_fact_work; /* device workspace */
 91:   PetscCuBLASInt fact_lwork;
 92:   PetscCuBLASInt *d_fact_info; /* device info */
 93:   /* workspace */
 94:   Vec         workvec;
 95: } Mat_SeqDenseCUDA;

 97: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
 98: {
 99:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
100:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
101:   PetscErrorCode   ierr;
102:   PetscBool        iscuda;
103:   cudaError_t      cerr;

106:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
107:   if (!iscuda) return(0);
108:   /* CPU preallocation may not have been performed yet */
109:   PetscLayoutSetUp(A->rmap);
110:   PetscLayoutSetUp(A->cmap);
111:   if (cA->lda <= 0) cA->lda = A->rmap->n;
112:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
113:   if (!d_data) { /* petsc-allocated storage */
114:     size_t sz;
115:     PetscIntMultError(cA->lda,A->cmap->n,NULL);
116:     sz   = cA->lda*A->cmap->n*sizeof(PetscScalar);
117:     cerr = cudaMalloc((void**)&dA->d_v,sz);CHKERRCUDA(cerr);
118:     cerr = cudaMemset(dA->d_v,0,sz);CHKERRCUDA(cerr);
119:     dA->user_alloc = PETSC_FALSE;
120:   } else { /* user-allocated storage */
121:     dA->d_v        = d_data;
122:     dA->user_alloc = PETSC_TRUE;
123:   }
124:   A->offloadmask  = PETSC_OFFLOAD_GPU;
125:   A->preallocated = PETSC_TRUE;
126:   A->assembled    = PETSC_TRUE;
127:   return(0);
128: }
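/* A hedged usage sketch (added, not from the original file): callers normally reach the
   preallocation routine above through MatCreateSeqDenseCUDA(), e.g.

     Mat         A;
     PetscScalar *d_data = NULL;  (passing NULL lets PETSc allocate and zero the device storage)
     ierr = MatCreateSeqDenseCUDA(PETSC_COMM_SELF,m,n,d_data,&A);CHKERRQ(ierr);

   If d_data is non-NULL it must point to device memory holding at least lda*n PetscScalars, and
   it is adopted as user-allocated storage (user_alloc = PETSC_TRUE). */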

130: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
131: {
132:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
133:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
134:   PetscErrorCode   ierr;
135:   cudaError_t      cerr;

139:   PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
140:   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
141:     if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
142:       MatSeqDenseSetPreallocation(A,NULL);
143:     }
144:     PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
145:     if (cA->lda > A->rmap->n) {
146:       PetscInt n = A->cmap->n,m = A->rmap->n;

148:       cerr = cudaMemcpy2D(cA->v,cA->lda*sizeof(PetscScalar),dA->d_v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
149:     } else {
150:       cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
151:     }
152:     PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
153:     PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

155:     A->offloadmask = PETSC_OFFLOAD_BOTH;
156:   }
157:   return(0);
158: }

160: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
161: {
162:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
163:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
164:   PetscBool        copy;
165:   PetscErrorCode   ierr;
166:   cudaError_t      cerr;

170:   if (A->boundtocpu) return(0);
171:   copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
172:   PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
173:   if (copy) {
174:     if (!dA->d_v) { /* Allocate GPU memory if not present */
175:       MatSeqDenseCUDASetPreallocation(A,NULL);
176:     }
177:     PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
178:     if (cA->lda > A->rmap->n) {
179:       PetscInt n = A->cmap->n,m = A->rmap->n;

181:       cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),cA->v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
182:     } else {
183:       cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
184:     }
185:     PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
186:     PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

188:     A->offloadmask = PETSC_OFFLOAD_BOTH;
189:   }
190:   return(0);
191: }
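/* Added note: the two copy routines above implement the usual PETSc offload state machine.
   A->offloadmask is PETSC_OFFLOAD_CPU when only the host copy is valid, PETSC_OFFLOAD_GPU when
   only the device copy is valid, PETSC_OFFLOAD_BOTH after a copy in either direction, and
   PETSC_OFFLOAD_UNALLOCATED before any data has been written.  When lda exceeds the number of
   rows, the columns are strided and cudaMemcpy2D() transfers only the m-by-n submatrix;
   otherwise a single contiguous cudaMemcpy() suffices. */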

193: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
194: {
195:   Mat_SeqDense      *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
196:   PetscErrorCode    ierr;
197:   const PetscScalar *va;
198:   PetscScalar       *vb;
199:   PetscInt          lda1=a->lda,lda2=b->lda,m=A->rmap->n,n=A->cmap->n;
200:   cudaError_t       cerr;

203:   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
204:   if (A->ops->copy != B->ops->copy) {
205:     MatCopy_Basic(A,B,str);
206:     return(0);
207:   }
208:   if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
209:   MatDenseCUDAGetArrayRead(A,&va);
210:   MatDenseCUDAGetArrayWrite(B,&vb);
211:   PetscLogGpuTimeBegin();
212:   if (lda1>m || lda2>m) {
213:     cerr = cudaMemcpy2D(vb,lda2*sizeof(PetscScalar),va,lda1*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
214:   } else {
215:     cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
216:   }
217:   PetscLogGpuTimeEnd();
218:   MatDenseCUDARestoreArrayWrite(B,&vb);
219:   MatDenseCUDARestoreArrayRead(A,&va);
220:   return(0);
221: }

223: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
224: {
225:   Mat_SeqDense      *a = (Mat_SeqDense*)A->data;
226:   PetscErrorCode    ierr;
227:   PetscScalar       *va;
228:   PetscInt          lda=a->lda,m = A->rmap->n,n = A->cmap->n;
229:   cudaError_t       cerr;

232:   MatDenseCUDAGetArrayWrite(A,&va);
233:   PetscLogGpuTimeBegin();
234:   if (lda>m) {
235:     cerr = cudaMemset2D(va,lda*sizeof(PetscScalar),0,m*sizeof(PetscScalar),n);CHKERRCUDA(cerr);
236:   } else {
237:     cerr = cudaMemset(va,0,m*(n*sizeof(PetscScalar)));CHKERRCUDA(cerr);
238:   }
239:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
240:   PetscLogGpuTimeEnd();
241:   MatDenseCUDARestoreArrayWrite(A,&va);
242:   return(0);
243: }

245: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
246: {
247:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
248:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
249:   PetscErrorCode   ierr;

252:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
253:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
254:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
255:   if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
256:   dA->unplacedarray = dA->d_v;
257:   dA->unplaced_user_alloc = dA->user_alloc;
258:   dA->d_v = (PetscScalar*)a;
259:   dA->user_alloc = PETSC_TRUE;
260:   return(0);
261: }

263: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
264: {
265:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
266:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
267:   PetscErrorCode   ierr;

270:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
271:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
272:   if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
273:   dA->d_v = dA->unplacedarray;
274:   dA->user_alloc = dA->unplaced_user_alloc;
275:   dA->unplacedarray = NULL;
276:   return(0);
277: }

279: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
280: {
281:   Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
282:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
283:   cudaError_t      cerr;

286:   if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
287:   if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
288:   if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
289:   if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
290:   dA->d_v = (PetscScalar*)a;
291:   dA->user_alloc = PETSC_FALSE;
292:   return(0);
293: }
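/* A short sketch (added commentary) contrasting the three array hooks above: PlaceArray
   temporarily substitutes a caller-supplied device buffer and stashes the original pointer,
   ResetArray restores the stashed pointer, and ReplaceArray hands a new buffer to the matrix,
   which takes ownership of it.  Assuming d_buf is device memory of at least lda*n PetscScalars:

     ierr = MatDenseCUDAPlaceArray(A,d_buf);CHKERRQ(ierr);
     ... operate on A, which now wraps d_buf ...
     ierr = MatDenseCUDAResetArray(A);CHKERRQ(ierr);   (A wraps its original storage again)
*/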

295: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
296: {
297:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
298:   PetscErrorCode   ierr;

301:   if (!dA->d_v) {
302:     MatSeqDenseCUDASetPreallocation(A,NULL);
303:   }
304:   *a = dA->d_v;
305:   return(0);
306: }

308: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
309: {
311:   *a = NULL;
312:   return(0);
313: }

315: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
316: {
317:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
318:   PetscErrorCode   ierr;

321:   MatSeqDenseCUDACopyToGPU(A);
322:   *a   = dA->d_v;
323:   return(0);
324: }

326: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
327: {
329:   *a = NULL;
330:   return(0);
331: }

333: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
334: {
335:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
336:   PetscErrorCode   ierr;

339:   MatSeqDenseCUDACopyToGPU(A);
340:   *a   = dA->d_v;
341:   return(0);
342: }

344: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
345: {
347:   *a = NULL;
348:   return(0);
349: }

351: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
352: {
353: #if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
354:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
355:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
356:   PetscScalar        *da;
357:   PetscErrorCode     ierr;
358:   cudaError_t        ccer;
359:   cusolverStatus_t   cerr;
360:   cusolverDnHandle_t handle;
361:   PetscCuBLASInt     n,lda;
362: #if defined(PETSC_USE_DEBUG)
363:   PetscCuBLASInt     info;
364: #endif

367:   if (!A->rmap->n || !A->cmap->n) return(0);
368:   PetscCUSOLVERDnGetHandle(&handle);
369:   PetscCuBLASIntCast(A->cmap->n,&n);
370:   PetscCuBLASIntCast(a->lda,&lda);
371:   if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
372:   else if (A->factortype == MAT_FACTOR_CHOLESKY) {
373:     if (!dA->d_fact_ipiv) { /* spd */
374:       PetscCuBLASInt il;

376:       MatDenseCUDAGetArray(A,&da);
377:       cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
378:       if (il > dA->fact_lwork) {
379:         dA->fact_lwork = il;

381:         ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
382:         ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
383:       }
384:       PetscLogGpuTimeBegin();
385:       cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
386:       PetscLogGpuTimeEnd();
387:       MatDenseCUDARestoreArray(A,&da);
388:       /* TODO (write cuda kernel) */
389:       MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
390:     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
391:   }
392: #if defined(PETSC_USE_DEBUG)
393:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
394:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
395:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
396: #endif
397:   PetscLogGpuFlops(1.0*n*n*n/3.0);
398:   A->ops->solve          = NULL;
399:   A->ops->solvetranspose = NULL;
400:   A->ops->matsolve       = NULL;
401:   A->factortype          = MAT_FACTOR_NONE;

403:   PetscFree(A->solvertype);
404:   return(0);
405: #else
406:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
407: #endif
408: }
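/* Added note: in the SPD (Cholesky) branch above, cusolverDnXpotri overwrites the lower
   triangle of the factored matrix with the corresponding triangle of the inverse;
   MatSeqDenseSymmetrize_Private() then mirrors it into the upper triangle on the host, pending
   the dedicated CUDA kernel mentioned in the TODO. */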

410: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose,
411:                                                      PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
412: {
413:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
414:   PetscScalar      *y;
415:   PetscCuBLASInt   m=0, k=0;
416:   PetscBool        xiscuda, yiscuda, aiscuda;
417:   cudaError_t      cerr;
418:   PetscErrorCode   ierr;

421:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
422:   PetscCuBLASIntCast(A->rmap->n,&m);
423:   PetscCuBLASIntCast(A->cmap->n,&k);
424:   PetscObjectTypeCompare((PetscObject)xx,VECSEQCUDA,&xiscuda);
425:   PetscObjectTypeCompare((PetscObject)yy,VECSEQCUDA,&yiscuda);
426:   {
427:     const PetscScalar *x;
428:     PetscBool xishost = PETSC_TRUE;

430:     /* The logic here is to minimize the amount of memory copying:
431:        if we called VecCUDAGetArrayRead(xx,&x) whenever xiscuda is true but the
432:        data has not yet been offloaded to the GPU, the data would first be copied
433:        to the GPU.  Since we only need the data in order to copy it into the y
434:        array, we take x from wherever the data already resides so that
435:        only one memcpy is performed */
436:     if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
437:       VecCUDAGetArrayRead(xx, &x);
438:       xishost =  PETSC_FALSE;
439:     } else {
440:       VecGetArrayRead(xx, &x);
441:     }
442:     if (k < m || !yiscuda) {
443:       if (!dA->workvec) {
444:         VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
445:       }
446:       VecCUDAGetArrayWrite(dA->workvec, &y);
447:     } else {
448:       VecCUDAGetArrayWrite(yy,&y);
449:     }
450:     cerr = cudaMemcpy(y,x,m*sizeof(PetscScalar),xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
451:   }
452:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
453:   if (!aiscuda) {
454:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
455:   }
456:   (*matsolve) (A, y, m, m, 1, k, transpose);
457:   if (!aiscuda) {
458:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
459:   }
460:   if (k < m || !yiscuda) {
461:     PetscScalar *yv;

463:     /* The logic here is that the data is not yet in either yy's GPU array or its
464:        CPU array.  There is nothing in the interface to say where the user would like
465:        it to end up.  So we choose the GPU, because it is the faster option */
466:     if (yiscuda) {
467:       VecCUDAGetArrayWrite(yy,&yv);
468:     } else {
469:       VecGetArray(yy,&yv);
470:     }
471:     cerr = cudaMemcpy(yv,y,k*sizeof(PetscScalar),yiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
472:     if (yiscuda) {
473:       VecCUDARestoreArrayWrite(yy,&yv);
474:     } else {
475:       VecRestoreArray(yy,&yv);
476:     }
477:     VecCUDARestoreArrayWrite(dA->workvec, &y);
478:   } else {
479:     VecCUDARestoreArrayWrite(yy,&y);
480:   }
481:   return(0);
482: }
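/* Added note: the k < m case above arises for rectangular (least-squares, QR) factorizations,
   where the right-hand side has m entries but the solution only k.  A length-m work vector lets
   the backend solve operate in place; only the first k entries are then copied into yy. */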

484: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose,
485:                                                         PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
486: {
487:   PetscScalar       *y;
488:   PetscInt          n, _ldb, _ldx;
489:   PetscBool         biscuda, xiscuda, aiscuda;
490:   PetscCuBLASInt    nrhs=0,m=0,k=0,ldb=0,ldx=0,ldy=0;
491:   cudaError_t       cerr;
492:   PetscErrorCode    ierr;

495:   if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
496:   PetscCuBLASIntCast(A->rmap->n,&m);
497:   PetscCuBLASIntCast(A->cmap->n,&k);
498:   MatGetSize(B,NULL,&n);
499:   PetscCuBLASIntCast(n,&nrhs);
500:   MatDenseGetLDA(B,&_ldb);
501:   PetscCuBLASIntCast(_ldb, &ldb);
502:   MatDenseGetLDA(X,&_ldx);
503:   PetscCuBLASIntCast(_ldx, &ldx);

505:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
506:   PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&xiscuda);
507:   {
508:     /* The logic here is to minimize the amount of memory copying:
509:        if we called MatDenseCUDAGetArrayRead(B,&b) whenever biscuda is true but the
510:        data has not yet been offloaded to the GPU, the data would first be copied
511:        to the GPU.  Since we only need the data in order to copy it into the y
512:        array, we take b from wherever the data already resides so that
513:        only one memcpy is performed */
514:     const PetscScalar *b;

516:     /* some copying from B will be involved */
517:     PetscBool bishost = PETSC_TRUE;

519:     if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
520:       MatDenseCUDAGetArrayRead(B,&b);
521:       bishost = PETSC_FALSE;
522:     } else {
523:       MatDenseGetArrayRead(B,&b);
524:     }
525:     if (ldx < m || !xiscuda) {
526:       /* X's array cannot be used as the work array (too small or not on the device), and B's
527:        * array cannot be used either (it is const), so allocate a new array */
528:       ldy = m;
529:       cerr = cudaMalloc((void**)&y,nrhs*m*sizeof(PetscScalar));CHKERRCUDA(cerr);
530:     } else {
531:       /* X's array can be used directly as the work array */
532:       ldy = ldx;
533:       MatDenseCUDAGetArrayWrite(X,&y);
534:     }
535:     cerr = cudaMemcpy2D(y,ldy*sizeof(PetscScalar),b,ldb*sizeof(PetscScalar),m*sizeof(PetscScalar),nrhs,bishost ? cudaMemcpyHostToDevice: cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
536:     if (bishost) {
537:       MatDenseRestoreArrayRead(B,&b);
538:     } else {
539:       MatDenseCUDARestoreArrayRead(B,&b);
540:     }
541:   }
542:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
543:   if (!aiscuda) {
544:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
545:   }
546:   (*matsolve) (A, y, ldy, m, nrhs, k, transpose);
547:   if (!aiscuda) {
548:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
549:   }
550:   if (ldx < m || !xiscuda) {
551:     PetscScalar *x;

553:     /* The logic here is that the data is not yet in either X's GPU array or its
554:        CPU array.  There is nothing in the interface to say where the user would like
555:        it to end up.  So we choose the GPU, because it is the faster option */
556:     if (xiscuda) {
557:       MatDenseCUDAGetArrayWrite(X,&x);
558:     } else {
559:       MatDenseGetArray(X,&x);
560:     }
561:     cerr = cudaMemcpy2D(x,ldx*sizeof(PetscScalar),y,ldy*sizeof(PetscScalar),k*sizeof(PetscScalar),nrhs,xiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
562:     if (xiscuda) {
563:       MatDenseCUDARestoreArrayWrite(X,&x);
564:     } else {
565:       MatDenseRestoreArray(X,&x);
566:     }
567:     cerr = cudaFree(y);CHKERRCUDA(cerr);
568:   } else {
569:     MatDenseCUDARestoreArrayWrite(X,&y);
570:   }
571:   return(0);
572: }

574: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
575: {
576:   Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
577:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
578:   const PetscScalar  *da;
579:   PetscCuBLASInt     lda;
580:   cusolverDnHandle_t handle;
581:   cudaError_t        ccer;
582:   cusolverStatus_t   cerr;
583:   int                info;
584:   PetscErrorCode     ierr;

587:   PetscCuBLASIntCast(mat->lda,&lda);
588:   MatDenseCUDAGetArrayRead(A,&da);
589:   PetscCUSOLVERDnGetHandle(&handle);
590:   PetscLogGpuTimeBegin();
591:   PetscInfo2(A,"LU solve %d x %d on backend\n",m,k);
592:   cerr = cusolverDnXgetrs(handle,T ? CUBLAS_OP_T : CUBLAS_OP_N,m,nrhs,da,lda,dA->d_fact_ipiv,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
593:   PetscLogGpuTimeEnd();
594:   MatDenseCUDARestoreArrayRead(A,&da);
595:   if (PetscDefined(USE_DEBUG)) {
596:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
597:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
598:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
599:   }
600:   PetscLogGpuFlops(nrhs*(2.0*m*m - m));
601:   return(0);
602: }

604: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
605: {
606:   Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
607:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
608:   const PetscScalar  *da;
609:   PetscCuBLASInt     lda;
610:   cusolverDnHandle_t handle;
611:   cudaError_t        ccer;
612:   cusolverStatus_t   cerr;
613:   int                info;
614:   PetscErrorCode     ierr;

617:   PetscCuBLASIntCast(mat->lda,&lda);
618:   MatDenseCUDAGetArrayRead(A,&da);
619:   PetscCUSOLVERDnGetHandle(&handle);
620:   PetscLogGpuTimeBegin();
621:   PetscInfo2(A,"Cholesky solve %d x %d on backend\n",m,k);
622:   if (!dA->d_fact_ipiv) { /* spd */
623:     /* Note: a cuda-memcheck run may report "Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery" at this call */
624:     cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,m,nrhs,da,lda,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
625:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
626:   PetscLogGpuTimeEnd();
627:   MatDenseCUDARestoreArrayRead(A,&da);
628:   if (PetscDefined(USE_DEBUG)) {
629:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
630:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
631:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
632:   }
633:   PetscLogGpuFlops(nrhs*(2.0*m*m - m));
634:   return(0);
635: }

637: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
638: {
639:   Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
640:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
641:   const PetscScalar  *da;
642:   PetscCuBLASInt     lda, rank;
643:   cusolverDnHandle_t handle;
644:   cublasHandle_t     bhandle;
645:   cudaError_t        ccer;
646:   cusolverStatus_t   csrr;
647:   cublasStatus_t     cbrr;
648:   int                info;
649:   cublasOperation_t  trans;
650:   PetscScalar        one = 1.;
651:   PetscErrorCode     ierr;

654:   PetscCuBLASIntCast(mat->lda,&lda);
655:   PetscCuBLASIntCast(mat->rank,&rank);
656:   MatDenseCUDAGetArrayRead(A,&da);
657:   PetscCUSOLVERDnGetHandle(&handle);
658:   PetscCUBLASGetHandle(&bhandle);
659:   PetscLogGpuTimeBegin();
660:   PetscInfo2(A,"QR solve %d x %d on backend\n",m,k);
661:   if (!T) {
662:     if (PetscDefined(USE_COMPLEX)) {
663:       trans = CUBLAS_OP_C;
664:     } else {
665:       trans = CUBLAS_OP_T;
666:     }
667:     csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
668:     if (PetscDefined(USE_DEBUG)) {
669:       ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
670:       if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
671:     }
672:     cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
673:   } else {
674:     cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
675:     csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
676:     if (PetscDefined(USE_DEBUG)) {
677:       ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
678:       if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
679:     }
680:   }
681:   PetscLogGpuTimeEnd();
682:   MatDenseCUDARestoreArrayRead(A,&da);
683:   PetscLogGpuFlops(nrhs*(4.0*m*mat->rank - PetscSqr(mat->rank)));
684:   return(0);
685: }
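/* Added note: the QR solve above is the standard least-squares procedure.  In the
   non-transposed case it forms Q^H b with cusolverDnXormqr (CUBLAS_OP_C for complex scalars,
   CUBLAS_OP_T for real) and then solves R x = Q^H b with cublasXtrsm; the transposed case
   applies the two steps in reverse order, solving R^T y = b and then forming x = Q y. */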

687: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
688: {

692:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
693:   return(0);
694: }

696: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
697: {

701:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
702:   return(0);
703: }

705: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
706: {

710:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
711:   return(0);
712: }

714: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
715: {

719:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
720:   return(0);
721: }

723: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
724: {

728:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
729:   return(0);
730: }

732: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
733: {

737:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
738:   return(0);
739: }

741: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
742: {

746:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
747:   return(0);
748: }

750: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
751: {

755:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
756:   return(0);
757: }

759: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
760: {

764:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
765:   return(0);
766: }

768: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
769: {

773:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
774:   return(0);
775: }

777: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
778: {

782:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
783:   return(0);
784: }

786: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
787: {

791:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
792:   return(0);
793: }

795: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
796: {
797:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
798:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
799:   PetscScalar        *da;
800:   PetscCuBLASInt     m,n,lda;
801: #if defined(PETSC_USE_DEBUG)
802:   int                info;
803: #endif
804:   cusolverStatus_t   cerr;
805:   cusolverDnHandle_t handle;
806:   cudaError_t        ccer;
807:   PetscErrorCode     ierr;

810:   if (!A->rmap->n || !A->cmap->n) return(0);
811:   PetscCUSOLVERDnGetHandle(&handle);
812:   MatDenseCUDAGetArray(A,&da);
813:   PetscCuBLASIntCast(A->cmap->n,&n);
814:   PetscCuBLASIntCast(A->rmap->n,&m);
815:   PetscCuBLASIntCast(a->lda,&lda);
816:   PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
817:   if (!dA->d_fact_ipiv) {
818:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
819:   }
820:   if (!dA->fact_lwork) {
821:     cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
822:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
823:   }
824:   if (!dA->d_fact_info) {
825:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
826:   }
827:   PetscLogGpuTimeBegin();
828:   cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
829:   PetscLogGpuTimeEnd();
830:   MatDenseCUDARestoreArray(A,&da);
831: #if defined(PETSC_USE_DEBUG)
832:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
833:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
834:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
835: #endif
836:   A->factortype = MAT_FACTOR_LU;
837:   PetscLogGpuFlops(2.0*n*n*m/3.0);

839:   A->ops->solve             = MatSolve_SeqDenseCUDA_LU;
840:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_LU;
841:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_LU;
842:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;

844:   PetscFree(A->solvertype);
845:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
846:   return(0);
847: }

849: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
850: {
851:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
852:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
853:   PetscScalar        *da;
854:   PetscCuBLASInt     n,lda;
855: #if defined(PETSC_USE_DEBUG)
856:   int                info;
857: #endif
858:   cusolverStatus_t   cerr;
859:   cusolverDnHandle_t handle;
860:   cudaError_t        ccer;
861:   PetscErrorCode     ierr;

864:   if (!A->rmap->n || !A->cmap->n) return(0);
865:   PetscCUSOLVERDnGetHandle(&handle);
866:   PetscCuBLASIntCast(A->rmap->n,&n);
867:   PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
868:   if (A->spd) {
869:     MatDenseCUDAGetArray(A,&da);
870:     PetscCuBLASIntCast(a->lda,&lda);
871:     if (!dA->fact_lwork) {
872:       cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
873:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
874:     }
875:     if (!dA->d_fact_info) {
876:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
877:     }
878:     PetscLogGpuTimeBegin();
879:     cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
880:     PetscLogGpuTimeEnd();

882:     MatDenseCUDARestoreArray(A,&da);
883: #if defined(PETSC_USE_DEBUG)
884:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
885:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
886:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
887: #endif
888:     A->factortype = MAT_FACTOR_CHOLESKY;
889:     PetscLogGpuFlops(1.0*n*n*n/3.0);
890:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
891: #if 0
892:     /* at the time this interface was written (CUDA 10.0), cusolverDn did not implement the *sytrs and *hetr* routines.
893:        The code below should work, and it can be activated once the *sytrs routines become available */
894:     if (!dA->d_fact_ipiv) {
895:       ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
896:     }
897:     if (!dA->fact_lwork) {
898:       cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
899:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
900:     }
901:     if (!dA->d_fact_info) {
902:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
903:     }
904:     PetscLogGpuTimeBegin();
905:     cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
906:     PetscLogGpuTimeEnd();
907: #endif

909:   A->ops->solve             = MatSolve_SeqDenseCUDA_Cholesky;
910:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_Cholesky;
911:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_Cholesky;
912:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
913:   PetscFree(A->solvertype);
914:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
915:   return(0);
916: }

918: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A,IS col,const MatFactorInfo *factinfo)
919: {
920:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
921:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
922:   PetscScalar        *da;
923:   PetscCuBLASInt     m,min,max,n,lda;
924: #if defined(PETSC_USE_DEBUG)
925:   int                info;
926: #endif
927:   cusolverStatus_t   cerr;
928:   cusolverDnHandle_t handle;
929:   cudaError_t        ccer;
930:   PetscErrorCode     ierr;

933:   if (!A->rmap->n || !A->cmap->n) return(0);
934:   PetscCUSOLVERDnGetHandle(&handle);
935:   MatDenseCUDAGetArray(A,&da);
936:   PetscCuBLASIntCast(A->cmap->n,&n);
937:   PetscCuBLASIntCast(A->rmap->n,&m);
938:   PetscCuBLASIntCast(a->lda,&lda);
939:   PetscInfo2(A,"QR factor %d x %d on backend\n",m,n);
940:   max = PetscMax(m,n);
941:   min = PetscMin(m,n);
942:   if (!dA->d_fact_tau) {
943:     ccer = cudaMalloc((void**)&dA->d_fact_tau,min*sizeof(*dA->d_fact_tau));CHKERRCUDA(ccer);
944:   }
945:   if (!dA->d_fact_ipiv) {
946:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
947:   }
948:   if (!dA->fact_lwork) {
949:     cerr = cusolverDnXgeqrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
950:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
951:   }
952:   if (!dA->d_fact_info) {
953:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
954:   }
955:   if (!dA->workvec) {
956:     VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
957:   }
958:   PetscLogGpuTimeBegin();
959:   cerr = cusolverDnXgeqrf(handle,m,n,da,lda,dA->d_fact_tau,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
960:   PetscLogGpuTimeEnd();
961:   MatDenseCUDARestoreArray(A,&da);
962: #if defined(PETSC_USE_DEBUG)
963:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
964:   if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
965: #endif
966:   A->factortype = MAT_FACTOR_QR;
967:   a->rank = min;
968:   PetscLogGpuFlops(2.0*min*min*(max-min/3.0));

970:   A->ops->solve             = MatSolve_SeqDenseCUDA_QR;
971:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_QR;
972:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_QR;
973:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;

975:   PetscFree(A->solvertype);
976:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
977:   return(0);
978: }

980: /* GEMM kernel: C = op(A)*op(B); tA and tB flag transposition of A and B, respectively */
981: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
982: {
983:   const PetscScalar *da,*db;
984:   PetscScalar       *dc;
985:   PetscScalar       one=1.0,zero=0.0;
986:   PetscCuBLASInt    m,n,k;
987:   PetscInt          alda,blda,clda;
988:   PetscErrorCode    ierr;
989:   cublasHandle_t    cublasv2handle;
990:   PetscBool         Aiscuda,Biscuda;
991:   cublasStatus_t    berr;

994:   /* we may end up with SEQDENSE as one of the arguments */
995:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
996:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
997:   if (!Aiscuda) {
998:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
999:   }
1000:   if (!Biscuda) {
1001:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1002:   }
1003:   PetscCuBLASIntCast(C->rmap->n,&m);
1004:   PetscCuBLASIntCast(C->cmap->n,&n);
1005:   if (tA) {
1006:     PetscCuBLASIntCast(A->rmap->n,&k);
1007:   } else {
1008:     PetscCuBLASIntCast(A->cmap->n,&k);
1009:   }
1010:   if (!m || !n || !k) return(0);
1011:   PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
1012:   MatDenseCUDAGetArrayRead(A,&da);
1013:   MatDenseCUDAGetArrayRead(B,&db);
1014:   MatDenseCUDAGetArrayWrite(C,&dc);
1015:   MatDenseGetLDA(A,&alda);
1016:   MatDenseGetLDA(B,&blda);
1017:   MatDenseGetLDA(C,&clda);
1018:   PetscCUBLASGetHandle(&cublasv2handle);
1019:   PetscLogGpuTimeBegin();
1020:   berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
1021:                      m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
1022:   PetscLogGpuTimeEnd();
1023:   PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
1024:   MatDenseCUDARestoreArrayRead(A,&da);
1025:   MatDenseCUDARestoreArrayRead(B,&db);
1026:   MatDenseCUDARestoreArrayWrite(C,&dc);
1027:   if (!Aiscuda) {
1028:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1029:   }
1030:   if (!Biscuda) {
1031:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1032:   }
1033:   return(0);
1034: }
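/* Added note: cublasXgemm computes C = alpha*op(A)*op(B) + beta*C on column-major data with
   leading dimensions alda, blda, and clda; with alpha = 1 and beta = 0 as above, C is simply
   overwritten by the product.  The logged flop count, m*n*k multiplications plus m*n*(k-1)
   additions, is the usual dense GEMM cost. */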

1036: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1037: {

1041:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
1042:   return(0);
1043: }

1045: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1046: {

1050:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
1051:   return(0);
1052: }

1054: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1055: {

1059:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
1060:   return(0);
1061: }

1063: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
1064: {

1068:   MatProductSetFromOptions_SeqDense(C);
1069:   return(0);
1070: }

1072: /* zz = op(A)*xx + yy
1073:    if yy == NULL, only MatMult */
1074: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
1075: {
1076:   Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
1077:   const PetscScalar *xarray,*da;
1078:   PetscScalar       *zarray;
1079:   PetscScalar       one=1.0,zero=0.0;
1080:   PetscCuBLASInt    m, n, lda;
1081:   cublasHandle_t    cublasv2handle;
1082:   cublasStatus_t    berr;
1083:   PetscErrorCode    ierr;

1086:   if (yy && yy != zz) { /* mult add */
1087:     VecCopy_SeqCUDA(yy,zz);
1088:   }
1089:   if (!A->rmap->n || !A->cmap->n) {
1090:     if (!yy) { /* mult only */
1091:       VecSet_SeqCUDA(zz,0.0);
1092:     }
1093:     return(0);
1094:   }
1095:   PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
1096:   PetscCuBLASIntCast(A->rmap->n,&m);
1097:   PetscCuBLASIntCast(A->cmap->n,&n);
1098:   PetscCuBLASIntCast(mat->lda,&lda);
1099:   PetscCUBLASGetHandle(&cublasv2handle);
1100:   MatDenseCUDAGetArrayRead(A,&da);
1101:   VecCUDAGetArrayRead(xx,&xarray);
1102:   VecCUDAGetArray(zz,&zarray);
1103:   PetscLogGpuTimeBegin();
1104:   berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
1105:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
1106:   PetscLogGpuTimeEnd();
1107:   PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
1108:   VecCUDARestoreArrayRead(xx,&xarray);
1109:   VecCUDARestoreArray(zz,&zarray);
1110:   MatDenseCUDARestoreArrayRead(A,&da);
1111:   return(0);
1112: }
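/* Added note: MatMult, MatMultTranspose, and the two Add variants below all reduce to a single
   cublasXgemv call computing z = alpha*op(A)*x + beta*z.  Passing yy == NULL selects beta = 0
   (a plain multiply); a non-NULL yy is first copied into zz and then accumulated with beta = 1. */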

1114: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1115: {

1119:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
1120:   return(0);
1121: }

1123: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1124: {

1128:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
1129:   return(0);
1130: }

1132: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1133: {

1137:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
1138:   return(0);
1139: }

1141: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1142: {

1146:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
1147:   return(0);
1148: }

1150: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
1151: {
1152:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1156:   MatSeqDenseCUDACopyFromGPU(A);
1157:   *array = mat->v;
1158:   return(0);
1159: }

1161: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
1162: {
1163:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1167:   if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
1168:     MatSeqDenseSetPreallocation(A,NULL);
1169:   }
1170:   *array = mat->v;
1171:   A->offloadmask = PETSC_OFFLOAD_CPU;
1172:   return(0);
1173: }

1175: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
1176: {
1177:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1181:   MatSeqDenseCUDACopyFromGPU(A);
1182:   *array = mat->v;
1183:   A->offloadmask = PETSC_OFFLOAD_CPU;
1184:   return(0);
1185: }

1187: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
1188: {
1189:   Mat_SeqDense   *y = (Mat_SeqDense*)Y->data;
1190:   PetscScalar    *dy;
1191:   PetscCuBLASInt j,N,m,lday,one = 1;
1192:   cublasHandle_t cublasv2handle;
1193:   cublasStatus_t berr;

1197:   PetscCUBLASGetHandle(&cublasv2handle);
1198:   MatDenseCUDAGetArray(Y,&dy);
1199:   PetscCuBLASIntCast(Y->rmap->n*Y->cmap->n,&N);
1200:   PetscCuBLASIntCast(Y->rmap->n,&m);
1201:   PetscCuBLASIntCast(y->lda,&lday);
1202:   PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1203:   PetscLogGpuTimeBegin();
1204:   if (lday>m) {
1205:     for (j=0; j<Y->cmap->n; j++) {
1206:       berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
1207:     }
1208:   } else {
1209:     berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
1210:   }
1211:   PetscLogGpuTimeEnd();
1212:   PetscLogGpuFlops(N);
1213:   MatDenseCUDARestoreArray(Y,&dy);
1214:   return(0);
1215: }

1217: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
1218: {
1219:   Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
1220:   Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
1221:   const PetscScalar *dx;
1222:   PetscScalar       *dy;
1223:   PetscCuBLASInt    j,N,m,ldax,lday,one = 1;
1224:   cublasHandle_t    cublasv2handle;
1225:   cublasStatus_t    berr;
1226:   PetscErrorCode    ierr;

1229:   if (!X->rmap->n || !X->cmap->n) return(0);
1230:   PetscCUBLASGetHandle(&cublasv2handle);
1231:   MatDenseCUDAGetArrayRead(X,&dx);
1232:   if (alpha != 0.0) {
1233:     MatDenseCUDAGetArray(Y,&dy);
1234:   } else {
1235:     MatDenseCUDAGetArrayWrite(Y,&dy);
1236:   }
1237:   PetscCuBLASIntCast(X->rmap->n*X->cmap->n,&N);
1238:   PetscCuBLASIntCast(X->rmap->n,&m);
1239:   PetscCuBLASIntCast(x->lda,&ldax);
1240:   PetscCuBLASIntCast(y->lda,&lday);
1241:   PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1242:   PetscLogGpuTimeBegin();
1243:   if (ldax>m || lday>m) {
1244:     for (j=0; j<X->cmap->n; j++) {
1245:       berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
1246:     }
1247:   } else {
1248:     berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
1249:   }
1250:   PetscLogGpuTimeEnd();
1251:   PetscLogGpuFlops(PetscMax(2.*N-1,0));
1252:   MatDenseCUDARestoreArrayRead(X,&dx);
1253:   if (alpha != 0.0) {
1254:     MatDenseCUDARestoreArray(Y,&dy);
1255:   } else {
1256:     MatDenseCUDARestoreArrayWrite(Y,&dy);
1257:   }
1258:   return(0);
1259: }

1261: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1262: {
1263:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1264:   cudaError_t      cerr;
1265:   PetscErrorCode   ierr;

1268:   if (dA) {
1269:     if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
1270:     if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
1271:     cerr = cudaFree(dA->d_fact_tau);CHKERRCUDA(cerr);
1272:     cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
1273:     cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
1274:     cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
1275:     VecDestroy(&dA->workvec);
1276:   }
1277:   PetscFree(A->spptr);
1278:   return(0);
1279: }

1281: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1282: {
1283:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1287:   /* prevent copying the data back if we own the data pointer */
1288:   if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
1289:   MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1290:   MatDestroy_SeqDense(A);
1291:   return(0);
1292: }

1294: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
1295: {
1296:   PetscErrorCode     ierr;
1297:   MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

1300:   MatCreate(PetscObjectComm((PetscObject)A),B);
1301:   MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1302:   MatSetType(*B,((PetscObject)A)->type_name);
1303:   MatDuplicateNoCreate_SeqDense(*B,A,hcpvalues);
1304:   if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) {
1305:     MatCopy_SeqDenseCUDA(A,*B,SAME_NONZERO_PATTERN);
1306:   }
1307:   return(0);
1308: }

1310: #include <petsc/private/vecimpl.h>

1312: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1313: {
1314:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1315:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1316:   PetscErrorCode   ierr;
1317:   PetscScalar      *x;
1318:   PetscBool        viscuda;
1319:   cudaError_t      cerr;

1322:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1323:   if (viscuda && !v->boundtocpu) { /* update device data */
1324:     VecCUDAGetArrayWrite(v,&x);
1325:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1326:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1327:     } else {
1328:       cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1329:     }
1330:     VecCUDARestoreArrayWrite(v,&x);
1331:   } else { /* update host data */
1332:     VecGetArrayWrite(v,&x);
1333:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1334:       PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1335:     } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1336:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1337:     }
1338:     VecRestoreArrayWrite(v,&x);
1339:   }
1340:   return(0);
1341: }
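/*
   Usage sketch (editorial addition, not part of the original source): copy one
   column of a MATSEQDENSECUDA matrix A into a previously created vector v of
   compatible length; A and v are assumed to exist.

     ierr = MatGetColumnVector(A,v,0);CHKERRQ(ierr);
*/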

1343: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1344: {

1348:   MatCreate(PetscObjectComm((PetscObject)A),fact);
1349:   MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1350:   MatSetType(*fact,MATSEQDENSECUDA);
1351:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1352:     (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1353:     (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1354:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1355:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1356:   } else if (ftype == MAT_FACTOR_QR) {
1357:     PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactor_C",MatQRFactor_SeqDense);
1358:     PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactorSymbolic_C",MatQRFactorSymbolic_SeqDense);
1359:   }
1360:   (*fact)->factortype = ftype;
1361:   PetscFree((*fact)->solvertype);
1362:   PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
1363:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1364:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1365:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1366:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1367:   return(0);
1368: }
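/*
   Usage sketch (editorial addition, not part of the original source): solve a
   dense linear system with the CUDA dense factorization registered above.
   The matrix A and the vectors b and x are assumed to exist; MATSOLVERCUDA
   selects this solver package.

     Mat           F;
     IS            rperm,cperm;
     MatFactorInfo info;

     ierr = MatGetFactor(A,MATSOLVERCUDA,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
     ierr = MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);CHKERRQ(ierr);
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatLUFactorSymbolic(F,A,rperm,cperm,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);
     ierr = ISDestroy(&rperm);CHKERRQ(ierr);
     ierr = ISDestroy(&cperm);CHKERRQ(ierr);
     ierr = MatDestroy(&F);CHKERRQ(ierr);
*/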

1370: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1371: {
1372:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1376:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1377:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1378:   MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1379:   if (!a->cvec) { /* we pass A's data so that the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1380:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1381:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1382:   }
1383:   a->vecinuse = col + 1;
1384:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1385:   *v   = a->cvec;
1386:   return(0);
1387: }

1389: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1390: {
1391:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1395:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1396:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1397:   a->vecinuse = 0;
1398:   VecCUDAResetArray(a->cvec);
1399:   MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1400:   *v   = NULL;
1401:   return(0);
1402: }
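/*
   Usage sketch (editorial addition, not part of the original source): borrow
   column 2 of a dense CUDA matrix A as a Vec, scale it in place, and return
   it; A is assumed to exist and to have at least 3 columns.

     Vec c;

     ierr = MatDenseGetColumnVec(A,2,&c);CHKERRQ(ierr);
     ierr = VecScale(c,2.0);CHKERRQ(ierr);
     ierr = MatDenseRestoreColumnVec(A,2,&c);CHKERRQ(ierr);
*/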

1404: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1405: {
1406:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1410:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1411:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1412:   MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1413:   if (!a->cvec) { /* we pass A's data so that the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1414:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1415:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1416:   }
1417:   a->vecinuse = col + 1;
1418:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1419:   VecLockReadPush(a->cvec);
1420:   *v   = a->cvec;
1421:   return(0);
1422: }

1424: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1425: {
1426:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1430:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVecRead() first");
1431:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1432:   a->vecinuse = 0;
1433:   VecLockReadPop(a->cvec);
1434:   VecCUDAResetArray(a->cvec);
1435:   MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1436:   *v   = NULL;
1437:   return(0);
1438: }

1440: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1441: {
1442:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1446:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1447:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1448:   MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1449:   if (!a->cvec) { /* we pass A's data so that the first call to VecCUDAPlaceArray() does not needlessly allocate GPU memory */
1450:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1451:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1452:   }
1453:   a->vecinuse = col + 1;
1454:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1455:   *v   = a->cvec;
1456:   return(0);
1457: }

1459: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1460: {
1461:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1465:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVecWrite() first");
1466:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1467:   a->vecinuse = 0;
1468:   VecCUDAResetArray(a->cvec);
1469:   MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1470:   *v   = NULL;
1471:   return(0);
1472: }

1474: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1475: {
1476:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1477:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1478:   PetscErrorCode   ierr;

1481:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1482:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1483:   if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1484:     MatDestroy(&a->cmat);
1485:   }
1486:   MatSeqDenseCUDACopyToGPU(A);
1487:   if (!a->cmat) {
1488:     MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1489:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1490:   } else {
1491:     MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1492:   }
1493:   MatDenseSetLDA(a->cmat,a->lda);
1494:   if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1495:   a->cmat->offloadmask = A->offloadmask;
1496:   a->matinuse = cbegin + 1;
1497:   *v = a->cmat;
1498:   return(0);
1499: }

1501: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1502: {
1503:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1507:   if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1508:   if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1509:   if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1510:   a->matinuse = 0;
1511:   A->offloadmask = PETSC_OFFLOAD_GPU;
1512:   MatDenseCUDAResetArray(a->cmat);
1513:   MatDenseResetArray(a->cmat);
1514:   *v = NULL;
1515:   return(0);
1516: }
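/*
   Usage sketch (editorial addition, not part of the original source): view
   columns [cbegin,cend) of A as a dense submatrix sharing A's storage, zero
   it, and return it; A is assumed to have at least cend columns.

     Mat S;

     ierr = MatDenseGetSubMatrix(A,cbegin,cend,&S);CHKERRQ(ierr);
     ierr = MatZeroEntries(S);CHKERRQ(ierr);
     ierr = MatDenseRestoreSubMatrix(A,&S);CHKERRQ(ierr);
*/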

1518: static PetscErrorCode  MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1519: {
1520:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
1521:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1522:   PetscBool        data;

1525:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1526:   if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1527:   if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1528:   cA->lda = lda;
1529:   return(0);
1530: }

1532: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1533: {
1534:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1538:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1539:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1540:   A->boundtocpu = flg;
1541:   if (!flg) {
1542:     PetscBool iscuda;

1544:     PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1545:     if (!iscuda) {
1546:       VecDestroy(&a->cvec);
1547:     }
1548:     PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1549:     if (!iscuda) {
1550:       MatDestroy(&a->cmat);
1551:     }
1552:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1553:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1554:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1555:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1556:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1557:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1558:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1559:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1560:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1561:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1562:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1563:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1564:     PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDenseCUDA);

1566:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1567:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1568:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1569:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1570:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1571:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1572:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1573:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1574:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1575:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1576:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1577:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1578:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1579:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1580:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1581:     A->ops->zeroentries             = MatZeroEntries_SeqDenseCUDA;
1582:   } else {
1583:     /* make sure we have an up-to-date copy on the CPU */
1584:     MatSeqDenseCUDACopyFromGPU(A);
1585:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1586:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1587:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1588:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1589:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1590:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1591:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1592:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1593:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1594:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1595:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1596:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1597:     PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDense);

1599:     A->ops->duplicate               = MatDuplicate_SeqDense;
1600:     A->ops->mult                    = MatMult_SeqDense;
1601:     A->ops->multadd                 = MatMultAdd_SeqDense;
1602:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1603:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1604:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1605:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1606:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1607:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1608:     A->ops->axpy                    = MatAXPY_SeqDense;
1609:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1610:     A->ops->lufactor                = MatLUFactor_SeqDense;
1612:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1613:     A->ops->scale                   = MatScale_SeqDense;
1614:     A->ops->copy                    = MatCopy_SeqDense;
1615:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1616:   }
1617:   if (a->cmat) {
1618:     MatBindToCPU(a->cmat,flg);
1619:   }
1620:   return(0);
1621: }
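/*
   Usage sketch (editorial addition, not part of the original source):
   temporarily force host execution for A (for example around operations that
   are only implemented on the CPU), then allow GPU execution again.

     ierr = MatBindToCPU(A,PETSC_TRUE);CHKERRQ(ierr);
     ... host-side operations on A ...
     ierr = MatBindToCPU(A,PETSC_FALSE);CHKERRQ(ierr);
*/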

1623: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1624: {
1625:   Mat              B;
1626:   PetscErrorCode   ierr;

1629:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1630:     /* TODO these cases should be optimized */
1631:     MatConvert_Basic(M,type,reuse,newmat);
1632:     return(0);
1633:   }

1635:   B    = *newmat;
1636:   MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
1637:   MatReset_SeqDenseCUDA(B);
1638:   PetscFree(B->defaultvectype);
1639:   PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1640:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
1641:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1642:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1643:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1644:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1645:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1646:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1647:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1648:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1649:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1650:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1651:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);

1653:   B->ops->bindtocpu = NULL;
1654:   B->ops->destroy = MatDestroy_SeqDense;
1655:   B->offloadmask = PETSC_OFFLOAD_CPU;
1656:   return(0);
1657: }

1659: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1660: {
1661:   Mat_SeqDenseCUDA *dB;
1662:   Mat              B;
1663:   PetscErrorCode   ierr;

1666:   PetscCUDAInitializeCheck();
1667:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1668:     /* TODO these cases should be optimized */
1669:     MatConvert_Basic(M,type,reuse,newmat);
1670:     return(0);
1671:   }

1673:   B    = *newmat;
1674:   PetscFree(B->defaultvectype);
1675:   PetscStrallocpy(VECCUDA,&B->defaultvectype);
1676:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
1677:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",            MatConvert_SeqDenseCUDA_SeqDense);
1678:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",                        MatDenseCUDAGetArray_SeqDenseCUDA);
1679:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",                    MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1680:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",                   MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1681:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",                    MatDenseCUDARestoreArray_SeqDenseCUDA);
1682:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",                MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1683:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",               MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1684:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",                      MatDenseCUDAPlaceArray_SeqDenseCUDA);
1685:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",                      MatDenseCUDAResetArray_SeqDenseCUDA);
1686:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",                    MatDenseCUDAReplaceArray_SeqDenseCUDA);
1687:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);

1689:   PetscNewLog(B,&dB);
1690:   B->spptr = dB;

1692:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

1694:   MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1695:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1696:   B->ops->destroy  = MatDestroy_SeqDenseCUDA;
1697:   return(0);
1698: }
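/*
   Usage sketch (editorial addition, not part of the original source): convert
   an existing MATSEQDENSE matrix A to the CUDA dense type in place.

     ierr = MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
*/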

1700: /*@C
1701:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1703:    Collective

1705:    Input Parameters:
1706: +  comm - MPI communicator
1707: .  m - number of rows
1708: .  n - number of columns
1709: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1710:    to control matrix memory allocation.

1712:    Output Parameter:
1713: .  A - the matrix

1715:    Notes:
   If data is passed in, it must point to valid GPU (device) memory; such user-provided storage is not freed by PETSc when the matrix is destroyed.

1717:    Level: intermediate

1719: .seealso: MatCreate(), MatCreateSeqDense()
1720: @*/
1721: PetscErrorCode  MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1722: {
1724:   PetscMPIInt    size;

1727:   MPI_Comm_size(comm,&size);
1728:   if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
1729:   MatCreate(comm,A);
1730:   MatSetSizes(*A,m,n,m,n);
1731:   MatSetType(*A,MATSEQDENSECUDA);
1732:   MatSeqDenseCUDASetPreallocation(*A,data);
1733:   return(0);
1734: }
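/*
   Usage sketch (editorial addition, not part of the original source): create a
   4x4 sequential dense CUDA matrix with PETSc-managed device storage, set an
   entry from the host, and assemble.

     Mat A;

     ierr = MatCreateSeqDenseCUDA(PETSC_COMM_SELF,4,4,NULL,&A);CHKERRQ(ierr);
     ierr = MatSetValue(A,0,0,1.0,INSERT_VALUES);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
*/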

1736: /*MC
1737:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1739:    Options Database Keys:
1740: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

1742:   Level: beginner
1743: M*/
1744: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1745: {

1749:   PetscCUDAInitializeCheck();
1750:   MatCreate_SeqDense(B);
1751:   MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1752:   return(0);
1753: }