Actual source code: densecuda.cu

  1: /*
  2:      Defines the matrix operations for sequential dense with CUDA
  3: */
  4: #include <petscpkg_version.h>
  5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  6: #include <../src/mat/impls/dense/seq/dense.h>
  7: #include <petsc/private/cudavecimpl.h>

/*
   Precision/complex dispatch macros: map the generic cusolverDnX* / cublasXtrsm
   names used below onto the correct typed cuSOLVER/cuBLAS entry points
   (C/Z/S/D variants), inserting the casts from PetscScalar* to the CUDA
   complex types where needed.  Note that for complex builds the "ormqr"
   generic names map to the unitary "unmqr" routines.
*/
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_REAL_SINGLE)
/* complex single: cuComplex */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnCpotrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnCpotrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnCpotrs((a),(b),(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnCpotri((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnCpotri_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnCsytrf((a),(b),(c),(cuComplex*)(d),(e),(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnCsytrf_bufferSize((a),(b),(cuComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnCgetrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnCgetrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnCgetrs((a),(b),(c),(d),(cuComplex*)(e),(f),(g),(cuComplex*)(h),(i),(j))
#define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnCgeqrf_bufferSize((a),(b),(c),(cuComplex*)(d),(e),(f))
#define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnCgeqrf((a),(b),(c),(cuComplex*)(d),(e),(cuComplex*)(f),(cuComplex*)(g),(h),(i))
#define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnCunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(l))
#define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnCunmqr((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(cuComplex*)(l),(m),(n))
#define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasCtrsm((a),(b),(c),(d),(e),(f),(g),(cuComplex*)(h),(cuComplex*)(i),(j),(cuComplex*)(k),(l))
#else /* complex double: cuDoubleComplex */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnZpotrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnZpotrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnZpotrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnZpotri((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnZpotri_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnZsytrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnZsytrf_bufferSize((a),(b),(cuDoubleComplex*)(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnZgetrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnZgetrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnZgetrs((a),(b),(c),(d),(cuDoubleComplex*)(e),(f),(g),(cuDoubleComplex*)(h),(i),(j))
#define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnZgeqrf_bufferSize((a),(b),(c),(cuDoubleComplex*)(d),(e),(f))
#define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnZgeqrf((a),(b),(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(cuDoubleComplex*)(g),(h),(i))
#define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnZunmqr_bufferSize((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(l))
#define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnZunmqr((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(m),(n))
#define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasZtrsm((a),(b),(c),(d),(e),(f),(g),(cuDoubleComplex*)(h),(cuDoubleComplex*)(i),(j),(cuDoubleComplex*)(k),(l))
#endif
#else /* not complex: real scalars */
#if defined(PETSC_USE_REAL_SINGLE)
/* real single: float */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnSpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnSpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnSpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnSpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnSpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnSsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnSsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnSgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnSgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnSgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnSgeqrf_bufferSize((a),(b),(c),(float*)(d),(e),(f))
#define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnSgeqrf((a),(b),(c),(float*)(d),(e),(float*)(f),(float*)(g),(h),(i))
#define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnSormqr_bufferSize((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(l))
#define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnSormqr((a),(b),(c),(d),(e),(f),(float*)(g),(h),(float*)(i),(float*)(j),(k),(float*)(l),(m),(n))
#define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasStrsm((a),(b),(c),(d),(e),(f),(g),(float*)(h),(float*)(i),(j),(float*)(k),(l))
#else /* real double */
#define cusolverDnXpotrf(a,b,c,d,e,f,g,h)        cusolverDnDpotrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotrf_bufferSize(a,b,c,d,e,f) cusolverDnDpotrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXpotrs(a,b,c,d,e,f,g,h,i)      cusolverDnDpotrs((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXpotri(a,b,c,d,e,f,g,h)        cusolverDnDpotri((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXpotri_bufferSize(a,b,c,d,e,f) cusolverDnDpotri_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXsytrf(a,b,c,d,e,f,g,h,i)      cusolverDnDsytrf((a),(b),(c),(d),(e),(f),(g),(h),(i))
#define cusolverDnXsytrf_bufferSize(a,b,c,d,e)   cusolverDnDsytrf_bufferSize((a),(b),(c),(d),(e))
#define cusolverDnXgetrf(a,b,c,d,e,f,g,h)        cusolverDnDgetrf((a),(b),(c),(d),(e),(f),(g),(h))
#define cusolverDnXgetrf_bufferSize(a,b,c,d,e,f) cusolverDnDgetrf_bufferSize((a),(b),(c),(d),(e),(f))
#define cusolverDnXgetrs(a,b,c,d,e,f,g,h,i,j)    cusolverDnDgetrs((a),(b),(c),(d),(e),(f),(g),(h),(i),(j))
#define cusolverDnXgeqrf_bufferSize(a,b,c,d,e,f) cusolverDnDgeqrf_bufferSize((a),(b),(c),(double*)(d),(e),(f))
#define cusolverDnXgeqrf(a,b,c,d,e,f,g,h,i)      cusolverDnDgeqrf((a),(b),(c),(double*)(d),(e),(double*)(f),(double*)(g),(h),(i))
#define cusolverDnXormqr_bufferSize(a,b,c,d,e,f,g,h,i,j,k,l) cusolverDnDormqr_bufferSize((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(l))
#define cusolverDnXormqr(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cusolverDnDormqr((a),(b),(c),(d),(e),(f),(double*)(g),(h),(double*)(i),(double*)(j),(k),(double*)(l),(m),(n))
#define cublasXtrsm(a,b,c,d,e,f,g,h,i,j,k,l)     cublasDtrsm((a),(b),(c),(d),(e),(f),(g),(double*)(h),(double*)(i),(j),(double*)(k),(l))
#endif
#endif

/* Device-side companion data for a MATSEQDENSECUDA matrix (hung off Mat->spptr) */
typedef struct {
  PetscScalar *d_v; /* pointer to the matrix on the GPU */
  PetscBool   user_alloc; /* PETSC_TRUE if d_v was supplied by the user (PETSc must not cudaFree it) */
  PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
  PetscBool   unplaced_user_alloc; /* ownership flag saved alongside unplacedarray */
  /* factorization support */
  PetscCuBLASInt *d_fact_ipiv; /* device pivots (LU); NULL signals an unpivoted/SPD Cholesky factorization */
  PetscScalar *d_fact_tau;  /* device QR tau vector */
  PetscScalar *d_fact_work; /* device workspace */
  PetscCuBLASInt fact_lwork; /* current size (in scalars) of d_fact_work */
  PetscCuBLASInt *d_fact_info; /* device info */
  /* workspace */
  Vec         workvec; /* lazily created CUDA vector used as scratch in the solve paths */
} Mat_SeqDenseCUDA;

/*
   Set up the device-side storage of a MATSEQDENSECUDA matrix.
   If d_data is NULL, PETSc cudaMalloc's (and zeroes) an lda*n buffer itself;
   otherwise the user-supplied device pointer is adopted and flagged so PETSc
   will not free it.  On success the GPU copy is the authoritative one.
   No-op for matrices that are not of type MATSEQDENSECUDA.
*/
PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;
  PetscBool        iscuda;
  cudaError_t      cerr;

  PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&iscuda);
  if (!iscuda) return(0);
  /* it may happen CPU preallocation has not been performed */
  PetscLayoutSetUp(A->rmap);
  PetscLayoutSetUp(A->cmap);
  if (cA->lda <= 0) cA->lda = A->rmap->n;
  /* release any previously PETSc-owned device buffer before (re)allocating */
  if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
  if (!d_data) { /* petsc-allocated storage */
    size_t sz;
    PetscIntMultError(cA->lda,A->cmap->n,NULL); /* overflow check only; product recomputed below */
    sz   = cA->lda*A->cmap->n*sizeof(PetscScalar);
    cerr = cudaMalloc((void**)&dA->d_v,sz);CHKERRCUDA(cerr);
    cerr = cudaMemset(dA->d_v,0,sz);CHKERRCUDA(cerr);
    dA->user_alloc = PETSC_FALSE;
  } else { /* user-allocated storage */
    dA->d_v        = d_data;
    dA->user_alloc = PETSC_TRUE;
  }
  /* freshly set device data is the up-to-date copy */
  A->offloadmask  = PETSC_OFFLOAD_GPU;
  A->preallocated = PETSC_TRUE;
  A->assembled    = PETSC_TRUE;
  return(0);
}

/*
   Mirror the device values back into the host array when the GPU copy is the
   newer one (offloadmask == PETSC_OFFLOAD_GPU); otherwise reuse what is on the
   CPU.  After a copy the matrix is marked PETSC_OFFLOAD_BOTH.
*/
PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  PetscInfo3(A,"%s matrix %d x %d\n",A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
      MatSeqDenseSetPreallocation(A,NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyFromGPU,A,0,0,0);
    if (cA->lda > A->rmap->n) {
      PetscInt n = A->cmap->n,m = A->rmap->n;

      /* padded columns (lda > m): pitched copy moves only the leading m rows of each column */
      cerr = cudaMemcpy2D(cA->v,cA->lda*sizeof(PetscScalar),dA->d_v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      /* contiguous layout: one flat copy of lda*n scalars */
      cerr = cudaMemcpy(cA->v,dA->d_v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    /* NOTE(review): logs lda*n bytes even on the pitched path which moves only m*n — slight overcount */
    PetscLogGpuToCpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyFromGPU,A,0,0,0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

/*
   Mirror the host values to the device when the CPU copy is newer (or the GPU
   buffer is unallocated).  No-op when the matrix is bound to the CPU.  After a
   copy the matrix is marked PETSC_OFFLOAD_BOTH.
*/
PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
{
  Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscBool        copy;
  PetscErrorCode   ierr;
  cudaError_t      cerr;

  if (A->boundtocpu) return(0);
  copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
  PetscInfo3(A,"%s matrix %d x %d\n",copy ? "Copy" : "Reusing",A->rmap->n,A->cmap->n);
  if (copy) {
    if (!dA->d_v) { /* Allocate GPU memory if not present */
      MatSeqDenseCUDASetPreallocation(A,NULL);
    }
    PetscLogEventBegin(MAT_DenseCopyToGPU,A,0,0,0);
    if (cA->lda > A->rmap->n) {
      PetscInt n = A->cmap->n,m = A->rmap->n;

      /* padded columns (lda > m): pitched copy moves only the leading m rows of each column */
      cerr = cudaMemcpy2D(dA->d_v,cA->lda*sizeof(PetscScalar),cA->v,cA->lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    } else {
      /* contiguous layout: one flat copy of lda*n scalars */
      cerr = cudaMemcpy(dA->d_v,cA->v,cA->lda*sizeof(PetscScalar)*A->cmap->n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }
    PetscLogCpuToGpu(cA->lda*sizeof(PetscScalar)*A->cmap->n);
    PetscLogEventEnd(MAT_DenseCopyToGPU,A,0,0,0);

    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

/*
   Device-to-device copy of A into B for two MATSEQDENSECUDA matrices of the
   same size.  Falls back to MatCopy_Basic when the two matrices do not share
   the same copy implementation.
*/
static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A,Mat B,MatStructure str)
{
  Mat_SeqDense      *a = (Mat_SeqDense*)A->data,*b = (Mat_SeqDense*)B->data;
  PetscErrorCode    ierr;
  const PetscScalar *va;
  PetscScalar       *vb;
  PetscInt          lda1=a->lda,lda2=b->lda,m=A->rmap->n,n=A->cmap->n;
  cudaError_t       cerr;

  /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
  if (A->ops->copy != B->ops->copy) {
    MatCopy_Basic(A,B,str);
    return(0);
  }
  if (m != B->rmap->n || n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"size(B) != size(A)");
  MatDenseCUDAGetArrayRead(A,&va);
  MatDenseCUDAGetArrayWrite(B,&vb);
  PetscLogGpuTimeBegin();
  if (lda1>m || lda2>m) {
    /* at least one side has padded columns: pitched copy handles differing ldas */
    cerr = cudaMemcpy2D(vb,lda2*sizeof(PetscScalar),va,lda1*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  } else {
    /* both contiguous: single flat copy of m*n scalars */
    cerr = cudaMemcpy(vb,va,m*(n*sizeof(PetscScalar)),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayWrite(B,&vb);
  MatDenseCUDARestoreArrayRead(A,&va);
  return(0);
}

/* Zero all entries of a MATSEQDENSECUDA matrix directly on the device. */
static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
{
  Mat_SeqDense   *a = (Mat_SeqDense*)A->data;
  PetscErrorCode ierr;
  PetscScalar    *darray;
  PetscInt       nrows = A->rmap->n, ncols = A->cmap->n, lda = a->lda;
  cudaError_t    cerr;

  MatDenseCUDAGetArrayWrite(A,&darray);
  PetscLogGpuTimeBegin();
  if (lda <= nrows) {
    /* contiguous storage: one flat memset over the whole block */
    cerr = cudaMemset(darray,0,nrows*(ncols*sizeof(PetscScalar)));CHKERRCUDA(cerr);
  } else {
    /* padded columns: zero only the leading nrows entries of each column */
    cerr = cudaMemset2D(darray,lda*sizeof(PetscScalar),0,nrows*sizeof(PetscScalar),ncols);CHKERRCUDA(cerr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayWrite(A,&darray);
  return(0);
}

/*
   Temporarily substitute the device array with user memory 'a'.  The current
   array (and its ownership flag) is stashed in unplacedarray until
   MatDenseCUDAResetArray() is called.  Errors if a previous placement has not
   been reset, or if a column-vector/submatrix access is outstanding.
*/
static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
{
  Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
  /* flush pending host values so the stashed device array is current */
  if (aa->v) { MatSeqDenseCUDACopyToGPU(A); }
  dA->unplacedarray = dA->d_v;
  dA->unplaced_user_alloc = dA->user_alloc;
  dA->d_v = (PetscScalar*)a;
  dA->user_alloc = PETSC_TRUE; /* the placed array belongs to the caller */
  return(0);
}

/*
   Undo MatDenseCUDAPlaceArray(): restore the stashed device array and its
   ownership flag.  Errors if a column-vector/submatrix access is outstanding.
*/
static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
{
  Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  /* flush pending host values into the placed array before dropping it */
  if (a->v) { MatSeqDenseCUDACopyToGPU(A); }
  dA->d_v = dA->unplacedarray;
  dA->user_alloc = dA->unplaced_user_alloc;
  dA->unplacedarray = NULL;
  return(0);
}

/*
   Permanently replace the device array with 'a', freeing the previous buffer
   if PETSc owned it.  Note user_alloc is set to PETSC_FALSE: subsequent code
   paths that free non-user-owned storage (e.g. preallocation/replace) will
   cudaFree the new array, i.e. the matrix takes ownership.
*/
static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
{
  Mat_SeqDense     *aa = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  cudaError_t      cerr;

  if (aa->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
  if (aa->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
  if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
  if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
  dA->d_v = (PetscScalar*)a;
  dA->user_alloc = PETSC_FALSE;
  return(0);
}

/* Write-only access to the device array; lazily allocates it on first use.
   No host-to-device copy is performed since the contents will be overwritten. */
static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dmat = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  if (!dmat->d_v) { MatSeqDenseCUDASetPreallocation(A,NULL); }
  *a = dmat->d_v;
  return(0);
}

/* Companion to GetArrayWrite: invalidate the caller's pointer. */
static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  *a = NULL;
  return(0);
}

/* Read-only access to the device array; syncs host values to the GPU first. */
static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a   = dA->d_v;
  return(0);
}

/* Companion to GetArrayRead: invalidate the caller's pointer. */
static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
{
  *a = NULL;
  return(0);
}

/* Read-write access to the device array; syncs host values to the GPU first. */
static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscErrorCode   ierr;

  MatSeqDenseCUDACopyToGPU(A);
  *a   = dA->d_v;
  return(0);
}

/* Companion to GetArray: invalidate the caller's pointer. */
static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
{
  *a = NULL;
  return(0);
}

/*
   Replace the factored matrix with its explicit inverse, on the device.
   Only the SPD Cholesky case (d_fact_ipiv == NULL) is implemented, via
   cusolverDnXpotri; LU (getri) and symmetric-indefinite (sytri) error out.
   Requires CUDA >= 10.1 (cusolverDn<t>potri availability).
   On success the matrix is no longer considered factored.
*/
PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
{
#if PETSC_PKG_CUDA_VERSION_GE(10,1,0)
  Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar        *da;
  PetscErrorCode     ierr;
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  cusolverDnHandle_t handle;
  PetscCuBLASInt     n,lda;
#if defined(PETSC_USE_DEBUG)
  PetscCuBLASInt     info;
#endif

  if (!A->rmap->n || !A->cmap->n) return(0); /* nothing to invert for empty matrices */
  PetscCUSOLVERDnGetHandle(&handle);
  PetscCuBLASIntCast(A->cmap->n,&n);
  PetscCuBLASIntCast(a->lda,&lda);
  if (A->factortype == MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDngetri not implemented");
  else if (A->factortype == MAT_FACTOR_CHOLESKY) {
    if (!dA->d_fact_ipiv) { /* spd */
      PetscCuBLASInt il;

      MatDenseCUDAGetArray(A,&da);
      /* grow the cached device workspace if potri needs more than we have */
      cerr = cusolverDnXpotri_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&il);CHKERRCUSOLVER(cerr);
      if (il > dA->fact_lwork) {
        dA->fact_lwork = il;

        ccer = cudaFree(dA->d_fact_work);CHKERRCUDA(ccer);
        ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
      }
      PetscLogGpuTimeBegin();
      cerr = cusolverDnXpotri(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
      PetscLogGpuTimeEnd();
      MatDenseCUDARestoreArray(A,&da);
      /* potri fills only the lower triangle; mirror it to the upper one.
         TODO (write cuda kernel) */
      MatSeqDenseSymmetrize_Private(A,PETSC_TRUE);
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytri not implemented");
  }
#if defined(PETSC_USE_DEBUG)
  /* device info flag is checked only in debug builds (costs a D2H transfer) */
  ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
  if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: leading minor of order %d is zero",info);
  else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
#endif
  PetscLogGpuFlops(1.0*n*n*n/3.0);
  /* matrix now holds the inverse: clear factorization state and solve ops */
  A->ops->solve          = NULL;
  A->ops->solvetranspose = NULL;
  A->ops->matsolve       = NULL;
  A->factortype          = MAT_FACTOR_NONE;

  PetscFree(A->solvertype);
  return(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Upgrade to CUDA version 10.1.0 or higher");
#endif
}

/*
   Common driver for the single-RHS solves: stage the right-hand side xx into a
   device buffer, run the factorization-specific 'matsolve' callback on it, and
   deposit the result into yy.  A scratch CUDA vector (dA->workvec) is used when
   yy cannot serve directly as the solve buffer (yy not CUDA, or k < m so the
   buffer must be m long while yy holds only k entries).
   FIX: the read access on xx (VecGetArrayRead/VecCUDAGetArrayRead) was never
   restored, leaving xx's array locked after every solve.
*/
static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose,
                                                     PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
{
  Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
  PetscScalar      *y;
  PetscCuBLASInt   m=0, k=0;
  PetscBool        xiscuda, yiscuda, aiscuda;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  PetscCuBLASIntCast(A->rmap->n,&m);
  PetscCuBLASIntCast(A->cmap->n,&k);
  PetscObjectTypeCompare((PetscObject)xx,VECSEQCUDA,&xiscuda);
  PetscObjectTypeCompare((PetscObject)yy,VECSEQCUDA,&yiscuda);
  {
    const PetscScalar *x;
    PetscBool xishost = PETSC_TRUE;

    /* The logic here is to try to minimize the amount of memory copying:
       if we call VecCUDAGetArrayRead(X,&x) every time xiscuda and the
       data is not offloaded to the GPU yet, then the data is copied to the
       GPU.  But we are only trying to get the data in order to copy it into the y
       array.  So the array x will be wherever the data already is so that
       only one memcpy is performed */
    if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
      VecCUDAGetArrayRead(xx, &x);
      xishost =  PETSC_FALSE;
    } else {
      VecGetArrayRead(xx, &x);
    }
    if (k < m || !yiscuda) {
      if (!dA->workvec) {
        VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
      }
      VecCUDAGetArrayWrite(dA->workvec, &y);
    } else {
      VecCUDAGetArrayWrite(yy,&y);
    }
    cerr = cudaMemcpy(y,x,m*sizeof(PetscScalar),xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    /* return the read access with the call matching where we got it from
       (previously missing, which left xx's array locked) */
    if (xishost) {
      VecRestoreArrayRead(xx, &x);
    } else {
      VecCUDARestoreArrayRead(xx, &x);
    }
  }
  /* a non-CUDA factored A is temporarily converted so the device solve can run */
  PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
  if (!aiscuda) {
    MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
  }
  (*matsolve) (A, y, m, m, 1, k, transpose);
  if (!aiscuda) {
    MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
  }
  if (k < m || !yiscuda) {
    PetscScalar *yv;

    /* The logic here is that the data is not yet in either yy's GPU array or its
       CPU array.  There is nothing in the interface to say where the user would like
       it to end up.  So we choose the GPU, because it is the faster option */
    if (yiscuda) {
      VecCUDAGetArrayWrite(yy,&yv);
    } else {
      VecGetArray(yy,&yv);
    }
    cerr = cudaMemcpy(yv,y,k*sizeof(PetscScalar),yiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (yiscuda) {
      VecCUDARestoreArrayWrite(yy,&yv);
    } else {
      VecRestoreArray(yy,&yv);
    }
    VecCUDARestoreArrayWrite(dA->workvec, &y);
  } else {
    VecCUDARestoreArrayWrite(yy,&y);
  }
  return(0);
}

/*
   Common driver for the multi-RHS solves: stage the right-hand sides B into a
   device buffer, run the factorization-specific 'matsolve' callback on it, and
   deposit the result into X.  A temporary device allocation is used when X's
   array cannot serve as the solve buffer (ldx < m or X not CUDA).
   FIX: when A was not a CUDA matrix, the post-solve conversion re-converted to
   MATSEQDENSECUDA instead of converting back to MATSEQDENSE, permanently
   changing A's type (compare MatSolve_SeqDenseCUDA_Internal).
*/
static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose,
                                                        PetscErrorCode (*matsolve)(Mat,PetscScalar*,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscCuBLASInt,PetscBool))
{
  PetscScalar       *y;
  PetscInt          n, _ldb, _ldx;
  PetscBool         biscuda, xiscuda, aiscuda;
  PetscCuBLASInt    nrhs=0,m=0,k=0,ldb=0,ldx=0,ldy=0;
  cudaError_t       cerr;
  PetscErrorCode    ierr;

  if (A->factortype == MAT_FACTOR_NONE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix must be factored to solve");
  PetscCuBLASIntCast(A->rmap->n,&m);
  PetscCuBLASIntCast(A->cmap->n,&k);
  MatGetSize(B,NULL,&n);
  PetscCuBLASIntCast(n,&nrhs);
  MatDenseGetLDA(B,&_ldb);
  PetscCuBLASIntCast(_ldb, &ldb);
  MatDenseGetLDA(X,&_ldx);
  PetscCuBLASIntCast(_ldx, &ldx);

  PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
  PetscObjectTypeCompare((PetscObject)X,MATSEQDENSECUDA,&xiscuda);
  {
    /* The logic here is to try to minimize the amount of memory copying:
       if we call MatDenseCUDAGetArrayRead(B,&b) every time biscuda and the
       data is not offloaded to the GPU yet, then the data is copied to the
       GPU.  But we are only trying to get the data in order to copy it into the y
       array.  So the array b will be wherever the data already is so that
       only one memcpy is performed */
    const PetscScalar *b;

    /* some copying from B will be involved */
    PetscBool bishost = PETSC_TRUE;

    if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
      MatDenseCUDAGetArrayRead(B,&b);
      bishost = PETSC_FALSE;
    } else {
      MatDenseGetArrayRead(B,&b);
    }
    if (ldx < m || !xiscuda) {
      /* X's array cannot serve as the array (too small or not on device), B's
       * array cannot serve as the array (const), so allocate a new array  */
      ldy = m;
      cerr = cudaMalloc((void**)&y,nrhs*m*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      /* X's array should serve as the array */
      ldy = ldx;
      MatDenseCUDAGetArrayWrite(X,&y);
    }
    cerr = cudaMemcpy2D(y,ldy*sizeof(PetscScalar),b,ldb*sizeof(PetscScalar),m*sizeof(PetscScalar),nrhs,bishost ? cudaMemcpyHostToDevice: cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    if (bishost) {
      MatDenseRestoreArrayRead(B,&b);
    } else {
      MatDenseCUDARestoreArrayRead(B,&b);
    }
  }
  /* a non-CUDA factored A is temporarily converted so the device solve can run */
  PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&aiscuda);
  if (!aiscuda) {
    MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
  }
  (*matsolve) (A, y, ldy, m, nrhs, k, transpose);
  if (!aiscuda) {
    /* convert back to the host type (was erroneously MATSEQDENSECUDA) */
    MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
  }
  if (ldx < m || !xiscuda) {
    PetscScalar *x;

    /* The logic here is that the data is not yet in either X's GPU array or its
       CPU array.  There is nothing in the interface to say where the user would like
       it to end up.  So we choose the GPU, because it is the faster option */
    if (xiscuda) {
      MatDenseCUDAGetArrayWrite(X,&x);
    } else {
      MatDenseGetArray(X,&x);
    }
    cerr = cudaMemcpy2D(x,ldx*sizeof(PetscScalar),y,ldy*sizeof(PetscScalar),k*sizeof(PetscScalar),nrhs,xiscuda ? cudaMemcpyDeviceToDevice: cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (xiscuda) {
      MatDenseCUDARestoreArrayWrite(X,&x);
    } else {
      MatDenseRestoreArray(X,&x);
    }
    cerr = cudaFree(y);CHKERRCUDA(cerr);
  } else {
    MatDenseCUDARestoreArrayWrite(X,&y);
  }
  return(0);
}

/*
   Back/forward substitution with an LU-factored device matrix via
   cusolverDnXgetrs; x holds the nrhs right-hand sides on entry (leading
   dimension ldx) and is overwritten with the solutions.
   NOTE(review): T selects CUBLAS_OP_T (plain transpose, not conjugate
   transpose) even in complex builds — confirm this matches the intended
   MatSolveTranspose semantics for complex scalars.
*/
static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
{
  Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscCuBLASInt     lda;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  int                info;
  PetscErrorCode     ierr;

  PetscCuBLASIntCast(mat->lda,&lda);
  MatDenseCUDAGetArrayRead(A,&da);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo2(A,"LU solve %d x %d on backend\n",m,k);
  cerr = cusolverDnXgetrs(handle,T ? CUBLAS_OP_T : CUBLAS_OP_N,m,nrhs,da,lda,dA->d_fact_ipiv,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayRead(A,&da);
  if (PetscDefined(USE_DEBUG)) {
    /* debug-only: fetch the device info flag (costs a D2H transfer) */
    ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
    if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
    else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
  }
  PetscLogGpuFlops(nrhs*(2.0*m*m - m));
  return(0);
}

/*
   Triangular solves with a Cholesky-factored device matrix via
   cusolverDnXpotrs (SPD case only, signalled by d_fact_ipiv == NULL);
   the symmetric-indefinite sytrs path is not implemented.  The T flag is
   unused here since the SPD factor is symmetric.
*/
static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
{
  Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscCuBLASInt     lda;
  cusolverDnHandle_t handle;
  cudaError_t        ccer;
  cusolverStatus_t   cerr;
  int                info;
  PetscErrorCode     ierr;

  PetscCuBLASIntCast(mat->lda,&lda);
  MatDenseCUDAGetArrayRead(A,&da);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscLogGpuTimeBegin();
  PetscInfo2(A,"Cholesky solve %d x %d on backend\n",m,k);
  if (!dA->d_fact_ipiv) { /* spd */
    /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
    cerr = cusolverDnXpotrs(handle,CUBLAS_FILL_MODE_LOWER,m,nrhs,da,lda,x,ldx,dA->d_fact_info);CHKERRCUSOLVER(cerr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"cusolverDnsytrs not implemented");
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayRead(A,&da);
  if (PetscDefined(USE_DEBUG)) {
    /* debug-only: fetch the device info flag (costs a D2H transfer) */
    ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
    if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
    else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
  }
  PetscLogGpuFlops(nrhs*(2.0*m*m - m));
  return(0);
}

/*
   Least-squares style solve with a QR-factored device matrix (A = Q*R from
   geqrf, Householder data in d_fact_tau, numerical rank in mat->rank).
   Forward solve (!T): x <- R^{-1} (Q^H x)  via ormqr/unmqr then trsm.
   Transpose solve (T): x <- Q (R^{-T} x)   via trsm then ormqr/unmqr.
   NOTE(review): the transpose branch uses CUBLAS_OP_T on trsm and no
   conjugation on the Q application even in complex builds — verify complex
   transpose (vs conjugate-transpose) semantics against the CPU QR path.
*/
static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
{
  Mat_SeqDense       *mat = (Mat_SeqDense*)A->data;
  Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
  const PetscScalar  *da;
  PetscCuBLASInt     lda, rank;
  cusolverDnHandle_t handle;
  cublasHandle_t     bhandle;
  cudaError_t        ccer;
  cusolverStatus_t   csrr;
  cublasStatus_t     cbrr;
  int                info;
  cublasOperation_t  trans;
  PetscScalar        one = 1.;
  PetscErrorCode     ierr;

  PetscCuBLASIntCast(mat->lda,&lda);
  PetscCuBLASIntCast(mat->rank,&rank);
  MatDenseCUDAGetArrayRead(A,&da);
  PetscCUSOLVERDnGetHandle(&handle);
  PetscCUBLASGetHandle(&bhandle);
  PetscLogGpuTimeBegin();
  PetscInfo2(A,"QR solve %d x %d on backend\n",m,k);
  if (!T) {
    /* apply Q^H (Q^T in real builds) to the right-hand sides, then solve with R */
    if (PetscDefined(USE_COMPLEX)) {
      trans = CUBLAS_OP_C;
    } else {
      trans = CUBLAS_OP_T;
    }
    csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
    if (PetscDefined(USE_DEBUG)) {
      ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
      if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
    }
    cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
  } else {
    /* transpose solve: R^T triangular solve first, then apply Q */
    cbrr = cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);CHKERRCUBLAS(cbrr);
    csrr = cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);CHKERRCUSOLVER(csrr);
    if (PetscDefined(USE_DEBUG)) {
      ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
      if (info != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
    }
  }
  PetscLogGpuTimeEnd();
  MatDenseCUDARestoreArrayRead(A,&da);
  /* NOTE(review): this logs via PetscLogFlops while the LU/Cholesky paths use
     PetscLogGpuFlops — possibly an inconsistency worth confirming upstream */
  PetscLogFlops(nrhs*(4.0*m*mat->rank - PetscSqr(mat->rank)));
  return(0);
}

/* Single-RHS forward solve: dispatch to the shared Vec driver with the LU kernel */
684: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
685: {

689:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
690:   return(0);
691: }

/* Single-RHS transposed solve (A^T x = b) via the LU kernel */
693: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A,Vec xx,Vec yy)
694: {

698:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
699:   return(0);
700: }

/* Single-RHS forward solve via the Cholesky kernel */
702: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
703: {

707:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
708:   return(0);
709: }

/* Single-RHS transposed solve via the Cholesky kernel */
711: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Vec xx,Vec yy)
712: {

716:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
717:   return(0);
718: }

/* Single-RHS forward (least-squares) solve via the QR kernel */
720: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
721: {

725:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
726:   return(0);
727: }

/* Single-RHS transposed solve via the QR kernel */
729: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A,Vec xx,Vec yy)
730: {

734:   MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
735:   return(0);
736: }

/* Multi-RHS forward solve (X solves A X = B) via the LU kernel */
738: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
739: {

743:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
744:   return(0);
745: }

/* Multi-RHS transposed solve via the LU kernel */
747: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A,Mat B,Mat X)
748: {

752:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
753:   return(0);
754: }

/* Multi-RHS forward solve via the Cholesky kernel */
756: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
757: {

761:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
762:   return(0);
763: }

/* Multi-RHS transposed solve via the Cholesky kernel */
765: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A,Mat B,Mat X)
766: {

770:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
771:   return(0);
772: }

/* Multi-RHS forward (least-squares) solve via the QR kernel */
774: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
775: {

779:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
780:   return(0);
781: }

/* Multi-RHS transposed solve via the QR kernel */
783: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A,Mat B,Mat X)
784: {

788:   MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
789:   return(0);
790: }

/* In-place LU factorization on the GPU via cusolverDn getrf (partial pivoting).
   The rperm/cperm/factinfo arguments are not referenced in this routine.
   Work buffers (pivots, workspace, device info) are allocated lazily on first
   use and reused across factorizations; on success the CUDA LU solve kernels
   are installed in A->ops. */
792: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A,IS rperm,IS cperm,const MatFactorInfo *factinfo)
793: {
794:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
795:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
796:   PetscScalar        *da;
797:   PetscCuBLASInt     m,n,lda;
798: #if defined(PETSC_USE_DEBUG)
799:   int                info;
800: #endif
801:   cusolverStatus_t   cerr;
802:   cusolverDnHandle_t handle;
803:   cudaError_t        ccer;
804:   PetscErrorCode     ierr;

     /* nothing to factor for an empty matrix */
807:   if (!A->rmap->n || !A->cmap->n) return(0);
808:   PetscCUSOLVERDnGetHandle(&handle);
809:   MatDenseCUDAGetArray(A,&da);
810:   PetscCuBLASIntCast(A->cmap->n,&n);
811:   PetscCuBLASIntCast(A->rmap->n,&m);
812:   PetscCuBLASIntCast(a->lda,&lda);
813:   PetscInfo2(A,"LU factor %d x %d on backend\n",m,n);
     /* lazily allocate pivot array, cusolver workspace, and device info flag */
814:   if (!dA->d_fact_ipiv) {
815:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
816:   }
817:   if (!dA->fact_lwork) {
818:     cerr = cusolverDnXgetrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
819:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
820:   }
821:   if (!dA->d_fact_info) {
822:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
823:   }
824:   PetscLogGpuTimeBegin();
825:   cerr = cusolverDnXgetrf(handle,m,n,da,lda,dA->d_fact_work,dA->d_fact_ipiv,dA->d_fact_info);CHKERRCUSOLVER(cerr);
826:   PetscLogGpuTimeEnd();
827:   MatDenseCUDARestoreArray(A,&da);
     /* in debug builds, bring the device-side info flag back and translate
        it into a PETSc error (info > 0: singular; info < 0: bad argument) */
828: #if defined(PETSC_USE_DEBUG)
829:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
830:   if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
831:   else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
832: #endif
833:   A->factortype = MAT_FACTOR_LU;
834:   PetscLogGpuFlops(2.0*n*n*m/3.0);

     /* route subsequent solves to the CUDA LU kernels */
836:   A->ops->solve             = MatSolve_SeqDenseCUDA_LU;
837:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_LU;
838:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_LU;
839:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;

841:   PetscFree(A->solvertype);
842:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
843:   return(0);
844: }

/* In-place Cholesky factorization on the GPU via cusolverDn potrf (lower
   triangle).  Only the SPD case is supported: the indefinite (sytrf/sytrs)
   path errors out because cusolverDn lacks the corresponding solve routines;
   the #if 0 block below is the ready-made code for when they appear.
   perm/factinfo are not referenced in this routine. */
846: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A,IS perm,const MatFactorInfo *factinfo)
847: {
848:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
849:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
850:   PetscScalar        *da;
851:   PetscCuBLASInt     n,lda;
852: #if defined(PETSC_USE_DEBUG)
853:   int                info;
854: #endif
855:   cusolverStatus_t   cerr;
856:   cusolverDnHandle_t handle;
857:   cudaError_t        ccer;
858:   PetscErrorCode     ierr;

     /* nothing to factor for an empty matrix */
861:   if (!A->rmap->n || !A->cmap->n) return(0);
862:   PetscCUSOLVERDnGetHandle(&handle);
863:   PetscCuBLASIntCast(A->rmap->n,&n);
864:   PetscInfo2(A,"Cholesky factor %d x %d on backend\n",n,n);
865:   if (A->spd) {
866:     MatDenseCUDAGetArray(A,&da);
867:     PetscCuBLASIntCast(a->lda,&lda);
       /* lazily allocate cusolver workspace and device info flag */
868:     if (!dA->fact_lwork) {
869:       cerr = cusolverDnXpotrf_bufferSize(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
870:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
871:     }
872:     if (!dA->d_fact_info) {
873:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
874:     }
875:     PetscLogGpuTimeBegin();
876:     cerr = cusolverDnXpotrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
877:     PetscLogGpuTimeEnd();

879:     MatDenseCUDARestoreArray(A,&da);
       /* debug builds: copy back the info flag and report zero pivot / bad argument */
880: #if defined(PETSC_USE_DEBUG)
881:     ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
882:     if (info > 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Bad factorization: zero pivot in row %d",info-1);
883:     else if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
884: #endif
885:     A->factortype = MAT_FACTOR_CHOLESKY;
886:     PetscLogGpuFlops(1.0*n*n*n/3.0);
887:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
888: #if 0
889:     /* at the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs and *hetr* routines
890:        The code below should work, and it can be activated when *sytrs routines will be available */
891:     if (!dA->d_fact_ipiv) {
892:       ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
893:     }
894:     if (!dA->fact_lwork) {
895:       cerr = cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
896:       ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
897:     }
898:     if (!dA->d_fact_info) {
899:       ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
900:     }
901:     PetscLogGpuTimeBegin();
902:     cerr = cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
903:     PetscLogGpuTimeEnd();
904: #endif

     /* route subsequent solves to the CUDA Cholesky kernels */
906:   A->ops->solve             = MatSolve_SeqDenseCUDA_Cholesky;
907:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_Cholesky;
908:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_Cholesky;
909:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
910:   PetscFree(A->solvertype);
911:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
912:   return(0);
913: }

/* In-place QR factorization on the GPU via cusolverDn geqrf (Householder).
   Stores the Householder scalars in dA->d_fact_tau and records full column
   rank (a->rank = min(m,n)); installs the CUDA QR solve kernels.
   col/factinfo are not referenced in this routine. */
915: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A,IS col,const MatFactorInfo *factinfo)
916: {
917:   Mat_SeqDense       *a = (Mat_SeqDense*)A->data;
918:   Mat_SeqDenseCUDA   *dA = (Mat_SeqDenseCUDA*)A->spptr;
919:   PetscScalar        *da;
920:   PetscCuBLASInt     m,min,max,n,lda;
921: #if defined(PETSC_USE_DEBUG)
922:   int                info;
923: #endif
924:   cusolverStatus_t   cerr;
925:   cusolverDnHandle_t handle;
926:   cudaError_t        ccer;
927:   PetscErrorCode     ierr;

     /* nothing to factor for an empty matrix */
930:   if (!A->rmap->n || !A->cmap->n) return(0);
931:   PetscCUSOLVERDnGetHandle(&handle);
932:   MatDenseCUDAGetArray(A,&da);
933:   PetscCuBLASIntCast(A->cmap->n,&n);
934:   PetscCuBLASIntCast(A->rmap->n,&m);
935:   PetscCuBLASIntCast(a->lda,&lda);
936:   PetscInfo2(A,"QR factor %d x %d on backend\n",m,n);
937:   max = PetscMax(m,n);
938:   min = PetscMin(m,n);
     /* lazily allocate tau (Householder scalars), workspace, info flag and a
        work vector; NOTE(review): d_fact_ipiv is allocated here but is not
        referenced by the QR solve path -- confirm it is actually needed */
939:   if (!dA->d_fact_tau) {
940:     ccer = cudaMalloc((void**)&dA->d_fact_tau,min*sizeof(*dA->d_fact_tau));CHKERRCUDA(ccer);
941:   }
942:   if (!dA->d_fact_ipiv) {
943:     ccer = cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));CHKERRCUDA(ccer);
944:   }
945:   if (!dA->fact_lwork) {
946:     cerr = cusolverDnXgeqrf_bufferSize(handle,m,n,da,lda,&dA->fact_lwork);CHKERRCUSOLVER(cerr);
947:     ccer = cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));CHKERRCUDA(ccer);
948:   }
949:   if (!dA->d_fact_info) {
950:     ccer = cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));CHKERRCUDA(ccer);
951:   }
952:   if (!dA->workvec) {
953:     VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
954:   }
955:   PetscLogGpuTimeBegin();
956:   cerr = cusolverDnXgeqrf(handle,m,n,da,lda,dA->d_fact_tau,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);CHKERRCUSOLVER(cerr);
957:   PetscLogGpuTimeEnd();
958:   MatDenseCUDARestoreArray(A,&da);
     /* debug builds: geqrf only reports bad arguments (info < 0) */
959: #if defined(PETSC_USE_DEBUG)
960:   ccer = cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);CHKERRCUDA(ccer);
961:   if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Wrong argument to cuSolver %d",-info);
962: #endif
963:   A->factortype = MAT_FACTOR_QR;
964:   a->rank = min;
     /* standard geqrf flop count: 2*min^2*(max - min/3) */
965:   PetscLogGpuFlops(2.0*min*min*(max-min/3.0));

     /* route subsequent solves to the CUDA QR kernels */
967:   A->ops->solve             = MatSolve_SeqDenseCUDA_QR;
968:   A->ops->solvetranspose    = MatSolveTranspose_SeqDenseCUDA_QR;
969:   A->ops->matsolve          = MatMatSolve_SeqDenseCUDA_QR;
970:   A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;

972:   PetscFree(A->solvertype);
973:   PetscStrallocpy(MATSOLVERCUDA,&A->solvertype);
974:   return(0);
975: }

977: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition.
     Non-CUDA dense operands are converted in place to MATSEQDENSECUDA for the
     product and converted back before returning. */
978: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A,Mat B,Mat C,PetscBool tA,PetscBool tB)
979: {
980:   const PetscScalar *da,*db;
981:   PetscScalar       *dc;
982:   PetscScalar       one=1.0,zero=0.0;
983:   PetscCuBLASInt    m,n,k;
984:   PetscInt          alda,blda,clda;
985:   PetscErrorCode    ierr;
986:   cublasHandle_t    cublasv2handle;
987:   PetscBool         Aiscuda,Biscuda;
988:   cublasStatus_t    berr;

991:   /* we may end up with SEQDENSE as one of the arguments */
992:   PetscObjectTypeCompare((PetscObject)A,MATSEQDENSECUDA,&Aiscuda);
993:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&Biscuda);
994:   if (!Aiscuda) {
995:     MatConvert(A,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&A);
996:   }
997:   if (!Biscuda) {
998:     MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
999:   }
     /* m x n result; k is the contraction length: rows of A if transposed,
        columns of A otherwise */
1000:   PetscCuBLASIntCast(C->rmap->n,&m);
1001:   PetscCuBLASIntCast(C->cmap->n,&n);
1002:   if (tA) {
1003:     PetscCuBLASIntCast(A->rmap->n,&k);
1004:   } else {
1005:     PetscCuBLASIntCast(A->cmap->n,&k);
1006:   }
1007:   if (!m || !n || !k) return(0);
1008:   PetscInfo3(C,"Matrix-Matrix product %d x %d x %d on backend\n",m,k,n);
1009:   MatDenseCUDAGetArrayRead(A,&da);
1010:   MatDenseCUDAGetArrayRead(B,&db);
1011:   MatDenseCUDAGetArrayWrite(C,&dc);
1012:   MatDenseGetLDA(A,&alda);
1013:   MatDenseGetLDA(B,&blda);
1014:   MatDenseGetLDA(C,&clda);
1015:   PetscCUBLASGetHandle(&cublasv2handle);
1016:   PetscLogGpuTimeBegin();
     /* cuBLAS is column-major, matching PETSc dense storage; beta = 0 so C is
        overwritten (write-only access above is safe) */
1017:   berr = cublasXgemm(cublasv2handle,tA ? CUBLAS_OP_T : CUBLAS_OP_N,tB ? CUBLAS_OP_T : CUBLAS_OP_N,
1018:                      m,n,k,&one,da,alda,db,blda,&zero,dc,clda);CHKERRCUBLAS(berr);
1019:   PetscLogGpuTimeEnd();
1020:   PetscLogGpuFlops(1.0*m*n*k + 1.0*m*n*(k-1));
1021:   MatDenseCUDARestoreArrayRead(A,&da);
1022:   MatDenseCUDARestoreArrayRead(B,&db);
1023:   MatDenseCUDARestoreArrayWrite(C,&dc);
     /* restore the original types of converted operands */
1024:   if (!Aiscuda) {
1025:     MatConvert(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1026:   }
1027:   if (!Biscuda) {
1028:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1029:   }
1030:   return(0);
1031: }

/* C = A^T * B */
1033: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1034: {

1038:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_TRUE,PETSC_FALSE);
1039:   return(0);
1040: }

/* C = A * B */
1042: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1043: {

1047:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_FALSE);
1048:   return(0);
1049: }

/* C = A * B^T */
1051: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A,Mat B,Mat C)
1052: {

1056:   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A,B,C,PETSC_FALSE,PETSC_TRUE);
1057:   return(0);
1058: }

/* Product setup is shared with the host dense implementation */
1060: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
1061: {

1065:   MatProductSetFromOptions_SeqDense(C);
1066:   return(0);
1067: }

1069: /* zz = op(A)*xx + yy
1070:    if yy == NULL, only MatMult.
      Implemented as a single cublas gemv: beta is 1 when an addend is given
      (zz already holds a copy of yy) and 0 otherwise. */
1071: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans)
1072: {
1073:   Mat_SeqDense      *mat = (Mat_SeqDense*)A->data;
1074:   const PetscScalar *xarray,*da;
1075:   PetscScalar       *zarray;
1076:   PetscScalar       one=1.0,zero=0.0;
1077:   PetscCuBLASInt    m, n, lda;
1078:   cublasHandle_t    cublasv2handle;
1079:   cublasStatus_t    berr;
1080:   PetscErrorCode    ierr;

     /* seed zz with yy so gemv's beta=1 accumulation yields op(A)*xx + yy */
1083:   if (yy && yy != zz) { /* mult add */
1084:     VecCopy_SeqCUDA(yy,zz);
1085:   }
     /* empty matrix: result is yy (already copied) or zero */
1086:   if (!A->rmap->n || !A->cmap->n) {
1087:     if (!yy) { /* mult only */
1088:       VecSet_SeqCUDA(zz,0.0);
1089:     }
1090:     return(0);
1091:   }
1092:   PetscInfo2(A,"Matrix-vector product %d x %d on backend\n",A->rmap->n,A->cmap->n);
1093:   PetscCuBLASIntCast(A->rmap->n,&m);
1094:   PetscCuBLASIntCast(A->cmap->n,&n);
1095:   PetscCuBLASIntCast(mat->lda,&lda);
1096:   PetscCUBLASGetHandle(&cublasv2handle);
1097:   MatDenseCUDAGetArrayRead(A,&da);
1098:   VecCUDAGetArrayRead(xx,&xarray);
1099:   VecCUDAGetArray(zz,&zarray);
1100:   PetscLogGpuTimeBegin();
1101:   berr = cublasXgemv(cublasv2handle,trans ? CUBLAS_OP_T : CUBLAS_OP_N,
1102:                      m,n,&one,da,lda,xarray,1,(yy ? &one : &zero),zarray,1);CHKERRCUBLAS(berr);
1103:   PetscLogGpuTimeEnd();
1104:   PetscLogGpuFlops(2.0*A->rmap->n*A->cmap->n - (yy ? 0 : A->rmap->n));
1105:   VecCUDARestoreArrayRead(xx,&xarray);
1106:   VecCUDARestoreArray(zz,&zarray);
1107:   MatDenseCUDARestoreArrayRead(A,&da);
1108:   return(0);
1109: }

/* zz = A*xx + yy */
1111: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1112: {

1116:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_FALSE);
1117:   return(0);
1118: }

/* zz = A^T*xx + yy */
1120: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A,Vec xx,Vec yy,Vec zz)
1121: {

1125:   MatMultAdd_SeqDenseCUDA_Private(A,xx,yy,zz,PETSC_TRUE);
1126:   return(0);
1127: }

/* yy = A*xx */
1129: PetscErrorCode MatMult_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1130: {

1134:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_FALSE);
1135:   return(0);
1136: }

/* yy = A^T*xx */
1138: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A,Vec xx,Vec yy)
1139: {

1143:   MatMultAdd_SeqDenseCUDA_Private(A,xx,NULL,yy,PETSC_TRUE);
1144:   return(0);
1145: }

/* Read-only host access: sync GPU data back to the host buffer, then expose
   it; offload mask is left unchanged since the host copy stays valid */
1147: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A,const PetscScalar **array)
1148: {
1149:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1153:   MatSeqDenseCUDACopyFromGPU(A);
1154:   *array = mat->v;
1155:   return(0);
1156: }

/* Write-only host access: no device->host sync (contents will be overwritten);
   marks the host copy as the valid one */
1158: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A,PetscScalar **array)
1159: {
1160:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1164:   if (!mat->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
1165:     MatSeqDenseSetPreallocation(A,NULL);
1166:   }
1167:   *array = mat->v;
1168:   A->offloadmask = PETSC_OFFLOAD_CPU;
1169:   return(0);
1170: }

/* Read-write host access: sync from GPU, then mark the host copy as the only
   valid one (the device copy becomes stale) */
1172: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A,PetscScalar **array)
1173: {
1174:   Mat_SeqDense   *mat = (Mat_SeqDense*)A->data;

1178:   MatSeqDenseCUDACopyFromGPU(A);
1179:   *array = mat->v;
1180:   A->offloadmask = PETSC_OFFLOAD_CPU;
1181:   return(0);
1182: }

/* Y <- alpha*Y on the GPU.  When lda exceeds the row count the columns are
   not contiguous, so each column is scaled separately; otherwise the whole
   buffer is scaled in one cublas scal call. */
1184: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y,PetscScalar alpha)
1185: {
1186:   Mat_SeqDense   *y = (Mat_SeqDense*)Y->data;
1187:   PetscScalar    *dy;
1188:   PetscCuBLASInt j,N,m,lday,one = 1;
1189:   cublasHandle_t cublasv2handle;
1190:   cublasStatus_t berr;

1194:   PetscCUBLASGetHandle(&cublasv2handle);
1195:   MatDenseCUDAGetArray(Y,&dy);
1196:   PetscCuBLASIntCast(Y->rmap->n*Y->cmap->n,&N);
1197:   PetscCuBLASIntCast(Y->rmap->n,&m);
1198:   PetscCuBLASIntCast(y->lda,&lday);
1199:   PetscInfo2(Y,"Performing Scale %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1200:   PetscLogGpuTimeBegin();
1201:   if (lday>m) {
       /* padded storage: scale column by column to skip the padding */
1202:     for (j=0; j<Y->cmap->n; j++) {
1203:       berr = cublasXscal(cublasv2handle,m,&alpha,dy+lday*j,one);CHKERRCUBLAS(berr);
1204:     }
1205:   } else {
1206:     berr = cublasXscal(cublasv2handle,N,&alpha,dy,one);CHKERRCUBLAS(berr);
1207:   }
1208:   PetscLogGpuTimeEnd();
1209:   PetscLogGpuFlops(N);
1210:   MatDenseCUDARestoreArray(Y,&dy);
1211:   return(0);
1212: }

/* Y <- alpha*X + Y on the GPU (str is not referenced; both operands are
   treated as dense).  Columns are processed individually when either operand
   has padded storage (lda > m). */
1214: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y,PetscScalar alpha,Mat X,MatStructure str)
1215: {
1216:   Mat_SeqDense      *x = (Mat_SeqDense*)X->data;
1217:   Mat_SeqDense      *y = (Mat_SeqDense*)Y->data;
1218:   const PetscScalar *dx;
1219:   PetscScalar       *dy;
1220:   PetscCuBLASInt    j,N,m,ldax,lday,one = 1;
1221:   cublasHandle_t    cublasv2handle;
1222:   cublasStatus_t    berr;
1223:   PetscErrorCode    ierr;

1226:   if (!X->rmap->n || !X->cmap->n) return(0);
1227:   PetscCUBLASGetHandle(&cublasv2handle);
1228:   MatDenseCUDAGetArrayRead(X,&dx);
     /* NOTE(review): the alpha == 0 path takes write-only access to Y, which
        skips the host->device sync and marks the GPU copy valid, yet axpy
        still reads/writes dy -- confirm this cannot expose stale device data
        when Y was last modified on the CPU */
1229:   if (alpha != 0.0) {
1230:     MatDenseCUDAGetArray(Y,&dy);
1231:   } else {
1232:     MatDenseCUDAGetArrayWrite(Y,&dy);
1233:   }
1234:   PetscCuBLASIntCast(X->rmap->n*X->cmap->n,&N);
1235:   PetscCuBLASIntCast(X->rmap->n,&m);
1236:   PetscCuBLASIntCast(x->lda,&ldax);
1237:   PetscCuBLASIntCast(y->lda,&lday);
1238:   PetscInfo2(Y,"Performing AXPY %d x %d on backend\n",Y->rmap->n,Y->cmap->n);
1239:   PetscLogGpuTimeBegin();
1240:   if (ldax>m || lday>m) {
       /* padded storage in either operand: axpy column by column */
1241:     for (j=0; j<X->cmap->n; j++) {
1242:       berr = cublasXaxpy(cublasv2handle,m,&alpha,dx+j*ldax,one,dy+j*lday,one);CHKERRCUBLAS(berr);
1243:     }
1244:   } else {
1245:     berr = cublasXaxpy(cublasv2handle,N,&alpha,dx,one,dy,one);CHKERRCUBLAS(berr);
1246:   }
1247:   PetscLogGpuTimeEnd();
1248:   PetscLogGpuFlops(PetscMax(2.*N-1,0));
1249:   MatDenseCUDARestoreArrayRead(X,&dx);
1250:   if (alpha != 0.0) {
1251:     MatDenseCUDARestoreArray(Y,&dy);
1252:   } else {
1253:     MatDenseCUDARestoreArrayWrite(Y,&dy);
1254:   }
1255:   return(0);
1256: }

/* Release all device-side state (values, factorization buffers, work vector)
   and free the spptr container.  cudaFree on a NULL pointer is a no-op, so
   the unconditional frees of the lazily-allocated buffers are safe. */
1258: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1259: {
1260:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1261:   cudaError_t      cerr;
1262:   PetscErrorCode   ierr;

1265:   if (dA) {
1266:     if (dA->unplacedarray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"MatDenseCUDAResetArray() must be called first");
       /* only free d_v if we allocated it (not a user-placed array) */
1267:     if (!dA->user_alloc) { cerr = cudaFree(dA->d_v);CHKERRCUDA(cerr); }
1268:     cerr = cudaFree(dA->d_fact_tau);CHKERRCUDA(cerr);
1269:     cerr = cudaFree(dA->d_fact_ipiv);CHKERRCUDA(cerr);
1270:     cerr = cudaFree(dA->d_fact_info);CHKERRCUDA(cerr);
1271:     cerr = cudaFree(dA->d_fact_work);CHKERRCUDA(cerr);
1272:     VecDestroy(&dA->workvec);
1273:   }
1274:   PetscFree(A->spptr);
1275:   return(0);
1276: }

/* Destroy: demote to plain SEQDENSE (which tears down the CUDA side) and
   then run the host dense destructor */
1278: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1279: {
1280:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1284:   /* prevent to copy back data if we own the data pointer */
1285:   if (!a->user_alloc) { A->offloadmask = PETSC_OFFLOAD_CPU; }
1286:   MatConvert_SeqDenseCUDA_SeqDense(A,MATSEQDENSE,MAT_INPLACE_MATRIX,&A);
1287:   MatDestroy_SeqDense(A);
1288:   return(0);
1289: }

/* Duplicate A.  If the up-to-date values live on the GPU, the host-side
   duplication is done without values and the data is then copied on the
   device via MatCopy_SeqDenseCUDA, avoiding a device->host round trip. */
1291: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A,MatDuplicateOption cpvalues,Mat *B)
1292: {
1293:   PetscErrorCode     ierr;
1294:   MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;

1297:   MatCreate(PetscObjectComm((PetscObject)A),B);
1298:   MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1299:   MatSetType(*B,((PetscObject)A)->type_name);
1300:   MatDuplicateNoCreate_SeqDense(*B,A,hcpvalues);
     /* values requested but skipped on the host: copy them on the device */
1301:   if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) {
1302:     MatCopy_SeqDenseCUDA(A,*B,SAME_NONZERO_PATTERN);
1303:   }
1304:   return(0);
1305: }

/* Copy column `col` of A into vector v, choosing the copy direction that
   avoids unnecessary transfers: device->device when both sides live on the
   GPU, host->device / device->host / host->host otherwise. */
1307: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A,Vec v,PetscInt col)
1308: {
1309:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1310:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1311:   PetscErrorCode   ierr;
1312:   PetscScalar      *x;
1313:   PetscBool        viscuda;
1314:   cudaError_t      cerr;

1317:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,VECCUDA,"");
1318:   if (viscuda && !v->boundtocpu) { /* update device data */
1319:     VecCUDAGetArrayWrite(v,&x);
1320:     if (A->offloadmask & PETSC_OFFLOAD_GPU) {
       /* both dA->d_v and x are device pointers; the copy kind was wrongly
          cudaMemcpyHostToHost here -- it must be DeviceToDevice */
1321:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
1322:     } else {
1323:       cerr = cudaMemcpy(x,a->v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1324:     }
1325:     VecCUDARestoreArrayWrite(v,&x);
1326:   } else { /* update host data */
1327:     VecGetArrayWrite(v,&x);
1328:     if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1329:       PetscArraycpy(x,a->v+col*a->lda,A->rmap->n);
1330:     } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1331:       cerr = cudaMemcpy(x,dA->d_v + col*a->lda,A->rmap->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1332:     }
1333:     VecRestoreArrayWrite(v,&x);
1334:   }
1335:   return(0);
1336: }

/* Factory for MATSOLVERCUDA dense factorizations: creates an empty
   MATSEQDENSECUDA factor matrix and wires up the symbolic phases (the
   numeric CUDA kernels are installed later by the factor routines above). */
1338: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A,MatFactorType ftype,Mat *fact)
1339: {

1343:   MatCreate(PetscObjectComm((PetscObject)A),fact);
1344:   MatSetSizes(*fact,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);
1345:   MatSetType(*fact,MATSEQDENSECUDA);
1346:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1347:     (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1348:     (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1349:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1350:     (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1351:   } else if (ftype == MAT_FACTOR_QR) {
     /* QR goes through composed functions rather than ops entries */
1352:     PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactor_C",MatQRFactor_SeqDense);
1353:     PetscObjectComposeFunction((PetscObject)(*fact),"MatQRFactorSymbolic_C",MatQRFactorSymbolic_SeqDense);
1354:   }
1355:   (*fact)->factortype = ftype;
1356:   PetscFree((*fact)->solvertype);
1357:   PetscStrallocpy(MATSOLVERCUDA,&(*fact)->solvertype);
     /* NOTE(review): no preferredordering entry is set for MAT_FACTOR_QR --
        confirm whether that is intentional */
1358:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1359:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1360:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1361:   PetscStrallocpy(MATORDERINGEXTERNAL,(char**)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1362:   return(0);
1363: }

/* Expose column `col` as a CUDA Vec with read-write access; must be paired
   with MatDenseRestoreColumnVec.  The single cached vector a->cvec is reused
   and repointed at the requested column via VecCUDAPlaceArray. */
1365: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1366: {
1367:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1371:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1372:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1373:   MatDenseCUDAGetArray(A,(PetscScalar**)&a->ptrinuse);
1374:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1375:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1376:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1377:   }
     /* vecinuse stores col+1 so 0 can mean "not in use" */
1378:   a->vecinuse = col + 1;
1379:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1380:   *v   = a->cvec;
1381:   return(0);
1382: }

/* Counterpart of MatDenseGetColumnVec: detach the cached vector from the
   column and release the read-write array access */
1384: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1385: {
1386:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1390:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1391:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1392:   a->vecinuse = 0;
1393:   VecCUDAResetArray(a->cvec);
1394:   MatDenseCUDARestoreArray(A,(PetscScalar**)&a->ptrinuse);
1395:   *v   = NULL;
1396:   return(0);
1397: }

/* Read-only variant of MatDenseGetColumnVec: same cached-vector mechanism,
   plus a read lock on the vector */
1399: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1400: {
1401:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1405:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1406:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1407:   MatDenseCUDAGetArrayRead(A,&a->ptrinuse);
1408:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1409:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1410:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1411:   }
     /* vecinuse stores col+1 so 0 can mean "not in use" */
1412:   a->vecinuse = col + 1;
1413:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1414:   VecLockReadPush(a->cvec);
1415:   *v   = a->cvec;
1416:   return(0);
1417: }

/* Counterpart of MatDenseGetColumnVecRead: pop the read lock, detach the
   cached vector and release the read-only array access */
1419: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1420: {
1421:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1425:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1426:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1427:   a->vecinuse = 0;
1428:   VecLockReadPop(a->cvec);
1429:   VecCUDAResetArray(a->cvec);
1430:   MatDenseCUDARestoreArrayRead(A,&a->ptrinuse);
1431:   *v   = NULL;
1432:   return(0);
1433: }

/* Write-only variant of MatDenseGetColumnVec: same cached-vector mechanism
   but with write-only array access (no device sync of prior contents) */
1435: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1436: {
1437:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1441:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1442:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1443:   MatDenseCUDAGetArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1444:   if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1445:     VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A),A->rmap->bs,A->rmap->n,a->ptrinuse,&a->cvec);
1446:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cvec);
1447:   }
     /* vecinuse stores col+1 so 0 can mean "not in use" */
1448:   a->vecinuse = col + 1;
1449:   VecCUDAPlaceArray(a->cvec,a->ptrinuse + (size_t)col * (size_t)a->lda);
1450:   *v   = a->cvec;
1451:   return(0);
1452: }

/* Counterpart of MatDenseGetColumnVecWrite: detach the cached vector and
   release the write-only array access */
1454: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A,PetscInt col,Vec *v)
1455: {
1456:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1460:   if (!a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetColumnVec() first");
1461:   if (!a->cvec) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column vector");
1462:   a->vecinuse = 0;
1463:   VecCUDAResetArray(a->cvec);
1464:   MatDenseCUDARestoreArrayWrite(A,(PetscScalar**)&a->ptrinuse);
1465:   *v   = NULL;
1466:   return(0);
1467: }

/* Expose columns [cbegin, cend) as a matrix sharing A's storage; must be
   paired with MatDenseRestoreSubMatrix.  A cached submatrix a->cmat is
   reused when the column count matches, otherwise recreated. */
1469: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A,PetscInt cbegin,PetscInt cend,Mat *v)
1470: {
1471:   Mat_SeqDense     *a = (Mat_SeqDense*)A->data;
1472:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1473:   PetscErrorCode   ierr;

1476:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1477:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
     /* cached submatrix has the wrong width: rebuild it */
1478:   if (a->cmat && cend-cbegin != a->cmat->cmap->N) {
1479:     MatDestroy(&a->cmat);
1480:   }
     /* make sure the device copy is current before aliasing it */
1481:   MatSeqDenseCUDACopyToGPU(A);
1482:   if (!a->cmat) {
1483:     MatCreateDenseCUDA(PetscObjectComm((PetscObject)A),A->rmap->n,PETSC_DECIDE,A->rmap->N,cend-cbegin,dA->d_v + (size_t)cbegin * (size_t)a->lda,&a->cmat);
1484:     PetscLogObjectParent((PetscObject)A,(PetscObject)a->cmat);
1485:   } else {
1486:     MatDenseCUDAPlaceArray(a->cmat,dA->d_v + (size_t)cbegin * (size_t)a->lda);
1487:   }
1488:   MatDenseSetLDA(a->cmat,a->lda);
     /* alias the host buffer too, when A has one */
1489:   if (a->v) { MatDensePlaceArray(a->cmat,a->v + (size_t)cbegin * (size_t)a->lda); }
1490:   a->cmat->offloadmask = A->offloadmask;
     /* matinuse stores cbegin+1 so 0 can mean "not in use" */
1491:   a->matinuse = cbegin + 1;
1492:   *v = a->cmat;
1493:   return(0);
1494: }

/* Return the submatrix view obtained from MatDenseGetSubMatrix_SeqDenseCUDA():
   detach the view's device and host arrays from A's storage and clear the
   caller's handle. */
1496: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A,Mat *v)
1497: {
1498:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1502:   if (!a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseGetSubMatrix() first");
1503:   if (!a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing internal column matrix");
1504:   if (*v != a->cmat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Not the matrix obtained from MatDenseGetSubMatrix()");
1505:   a->matinuse = 0;
  /* mark the GPU copy of A as the authoritative one, presumably because the
     aliased view may have been written on the device -- NOTE(review): this is
     unconditional even for read-only use of the view */
1506:   A->offloadmask = PETSC_OFFLOAD_GPU;
  /* detach both the device and host arrays that were placed into the view */
1507:   MatDenseCUDAResetArray(a->cmat);
1508:   MatDenseResetArray(a->cmat);
1509:   *v = NULL;
1510:   return(0);
1511: }

1513: static PetscErrorCode  MatDenseSetLDA_SeqDenseCUDA(Mat A,PetscInt lda)
1514: {
1515:   Mat_SeqDense     *cA = (Mat_SeqDense*)A->data;
1516:   Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA*)A->spptr;
1517:   PetscBool        data;

1520:   data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE);
1521:   if (!dA->user_alloc && data && cA->lda!=lda) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"LDA cannot be changed after allocation of internal storage");
1522:   if (lda < A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"LDA %D must be at least matrix dimension %D",lda,A->rmap->n);
1523:   cA->lda = lda;
1524:   return(0);
1525: }

1527: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A,PetscBool flg)
1528: {
1529:   Mat_SeqDense   *a = (Mat_SeqDense*)A->data;

1533:   if (a->vecinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreColumnVec() first");
1534:   if (a->matinuse) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Need to call MatDenseRestoreSubMatrix() first");
1535:   A->boundtocpu = flg;
1536:   if (!flg) {
1537:     PetscBool iscuda;

1539:     PetscObjectTypeCompare((PetscObject)a->cvec,VECSEQCUDA,&iscuda);
1540:     if (!iscuda) {
1541:       VecDestroy(&a->cvec);
1542:     }
1543:     PetscObjectTypeCompare((PetscObject)a->cmat,MATSEQDENSECUDA,&iscuda);
1544:     if (!iscuda) {
1545:       MatDestroy(&a->cmat);
1546:     }
1547:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDenseCUDA);
1548:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArrayRead_SeqDenseCUDA);
1549:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArrayWrite_SeqDenseCUDA);
1550:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDenseCUDA);
1551:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDenseCUDA);
1552:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDenseCUDA);
1553:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1554:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDenseCUDA);
1555:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1556:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDenseCUDA);
1557:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDenseCUDA);
1558:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDenseCUDA);
1559:     PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDenseCUDA);

1561:     A->ops->duplicate               = MatDuplicate_SeqDenseCUDA;
1562:     A->ops->mult                    = MatMult_SeqDenseCUDA;
1563:     A->ops->multadd                 = MatMultAdd_SeqDenseCUDA;
1564:     A->ops->multtranspose           = MatMultTranspose_SeqDenseCUDA;
1565:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDenseCUDA;
1566:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1567:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1568:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1569:     A->ops->axpy                    = MatAXPY_SeqDenseCUDA;
1570:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDenseCUDA;
1571:     A->ops->lufactor                = MatLUFactor_SeqDenseCUDA;
1572:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDenseCUDA;
1573:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDenseCUDA;
1574:     A->ops->scale                   = MatScale_SeqDenseCUDA;
1575:     A->ops->copy                    = MatCopy_SeqDenseCUDA;
1576:     A->ops->zeroentries             = MatZeroEntries_SeqDenseCUDA;
1577:   } else {
1578:     /* make sure we have an up-to-date copy on the CPU */
1579:     MatSeqDenseCUDACopyFromGPU(A);
1580:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArray_C",MatDenseGetArray_SeqDense);
1581:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayRead_C",MatDenseGetArray_SeqDense);
1582:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetArrayWrite_C",MatDenseGetArray_SeqDense);
1583:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVec_C",MatDenseGetColumnVec_SeqDense);
1584:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVec_C",MatDenseRestoreColumnVec_SeqDense);
1585:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecRead_C",MatDenseGetColumnVecRead_SeqDense);
1586:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecRead_C",MatDenseRestoreColumnVecRead_SeqDense);
1587:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetColumnVecWrite_C",MatDenseGetColumnVecWrite_SeqDense);
1588:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreColumnVecWrite_C",MatDenseRestoreColumnVecWrite_SeqDense);
1589:     PetscObjectComposeFunction((PetscObject)A,"MatDenseGetSubMatrix_C",MatDenseGetSubMatrix_SeqDense);
1590:     PetscObjectComposeFunction((PetscObject)A,"MatDenseRestoreSubMatrix_C",MatDenseRestoreSubMatrix_SeqDense);
1591:     PetscObjectComposeFunction((PetscObject)A,"MatDenseSetLDA_C",MatDenseSetLDA_SeqDense);
1592:     PetscObjectComposeFunction((PetscObject)A,"MatQRFactor_C",MatQRFactor_SeqDense);

1594:     A->ops->duplicate               = MatDuplicate_SeqDense;
1595:     A->ops->mult                    = MatMult_SeqDense;
1596:     A->ops->multadd                 = MatMultAdd_SeqDense;
1597:     A->ops->multtranspose           = MatMultTranspose_SeqDense;
1598:     A->ops->multtransposeadd        = MatMultTransposeAdd_SeqDense;
1599:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1600:     A->ops->matmultnumeric          = MatMatMultNumeric_SeqDense_SeqDense;
1601:     A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1602:     A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1603:     A->ops->axpy                    = MatAXPY_SeqDense;
1604:     A->ops->choleskyfactor          = MatCholeskyFactor_SeqDense;
1605:     A->ops->lufactor                = MatLUFactor_SeqDense;
1606:     A->ops->productsetfromoptions   = MatProductSetFromOptions_SeqDense;
1607:     A->ops->getcolumnvector         = MatGetColumnVector_SeqDense;
1608:     A->ops->scale                   = MatScale_SeqDense;
1609:     A->ops->copy                    = MatCopy_SeqDense;
1610:     A->ops->zeroentries             = MatZeroEntries_SeqDense;
1611:   }
1612:   if (a->cmat) {
1613:     MatBindToCPU(a->cmat,flg);
1614:   }
1615:   return(0);
1616: }

/* Convert a MATSEQDENSECUDA matrix M to plain MATSEQDENSE.  Only the in-place
   path is handled here; the copy/reuse paths fall back to the generic
   MatConvert_Basic(). */
1618: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1619: {
1620:   Mat              B;
1621:   PetscErrorCode   ierr;

1624:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1625:     /* TODO these cases should be optimized */
1626:     MatConvert_Basic(M,type,reuse,newmat);
1627:     return(0);
1628:   }

  /* in-place conversion: newmat aliases M */
1630:   B    = *newmat;
  /* flush device data back to the host and install the SeqDense methods */
1631:   MatBindToCPU_SeqDenseCUDA(B,PETSC_TRUE);
  /* free the CUDA-side storage attached to B->spptr */
1632:   MatReset_SeqDenseCUDA(B);
  /* vectors created from B should now default to host vectors */
1633:   PetscFree(B->defaultvectype);
1634:   PetscStrallocpy(VECSTANDARD,&B->defaultvectype);
1635:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSE);
  /* remove all CUDA-only composed methods (NULL uncomposes them) */
1636:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",NULL);
1637:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",NULL);
1638:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",NULL);
1639:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",NULL);
1640:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",NULL);
1641:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",NULL);
1642:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",NULL);
1643:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",NULL);
1644:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",NULL);
1645:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",NULL);
1646:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",NULL);

  /* plain SeqDense matrices cannot be (re)bound and use the host destructor;
     after MatBindToCPU above the valid copy lives on the CPU */
1648:   B->ops->bindtocpu = NULL;
1649:   B->ops->destroy = MatDestroy_SeqDense;
1650:   B->offloadmask = PETSC_OFFLOAD_CPU;
1651:   return(0);
1652: }

/* Convert a plain MATSEQDENSE matrix M to MATSEQDENSECUDA.  Only the in-place
   path is handled here; the copy/reuse paths fall back to the generic
   MatConvert_Basic(). */
1654: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M,MatType type,MatReuse reuse,Mat *newmat)
1655: {
1656:   Mat_SeqDenseCUDA *dB;
1657:   Mat              B;
1658:   PetscErrorCode   ierr;

  /* make sure the CUDA runtime is usable before touching any device state */
1661:   PetscCUDAInitializeCheck();
1662:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1663:     /* TODO these cases should be optimized */
1664:     MatConvert_Basic(M,type,reuse,newmat);
1665:     return(0);
1666:   }

  /* in-place conversion: newmat aliases M */
1668:   B    = *newmat;
  /* vectors created from B should now default to CUDA vectors */
1669:   PetscFree(B->defaultvectype);
1670:   PetscStrallocpy(VECCUDA,&B->defaultvectype);
1671:   PetscObjectChangeTypeName((PetscObject)B,MATSEQDENSECUDA);
  /* install the CUDA-specific composed methods */
1672:   PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqdensecuda_seqdense_C",            MatConvert_SeqDenseCUDA_SeqDense);
1673:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArray_C",                        MatDenseCUDAGetArray_SeqDenseCUDA);
1674:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayRead_C",                    MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1675:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAGetArrayWrite_C",                   MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1676:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArray_C",                    MatDenseCUDARestoreArray_SeqDenseCUDA);
1677:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayRead_C",                MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1678:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDARestoreArrayWrite_C",               MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1679:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAPlaceArray_C",                      MatDenseCUDAPlaceArray_SeqDenseCUDA);
1680:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAResetArray_C",                      MatDenseCUDAResetArray_SeqDenseCUDA);
1681:   PetscObjectComposeFunction((PetscObject)B,"MatDenseCUDAReplaceArray_C",                    MatDenseCUDAReplaceArray_SeqDenseCUDA);
1682:   PetscObjectComposeFunction((PetscObject)B,"MatProductSetFromOptions_seqaij_seqdensecuda_C",MatProductSetFromOptions_SeqAIJ_SeqDense);

  /* attach a fresh (zeroed) CUDA-side structure; no device memory yet */
1684:   PetscNewLog(B,&dB);
1685:   B->spptr = dB;

1687:   B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  /* install the CUDA ops table and make future rebinds possible */
1689:   MatBindToCPU_SeqDenseCUDA(B,PETSC_FALSE);
1690:   B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1691:   B->ops->destroy  = MatDestroy_SeqDenseCUDA;
1692:   return(0);
1693: }

1695: /*@C
1696:    MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.

1698:    Collective

1700:    Input Parameters:
1701: +  comm - MPI communicator
1702: .  m - number of rows
1703: .  n - number of columns
1704: -  data - optional location of GPU matrix data.  Set data=NULL for PETSc
1705:    to control matrix memory allocation.

1707:    Output Parameter:
1708: .  A - the matrix

1710:    Notes:
   If data is provided, it must point to valid GPU (device) memory; pass
   data=NULL to have PETSc allocate and manage the matrix storage itself.

1712:    Level: intermediate

1714: .seealso: MatCreate(), MatCreateSeqDense()
1715: @*/
1716: PetscErrorCode  MatCreateSeqDenseCUDA(MPI_Comm comm,PetscInt m,PetscInt n,PetscScalar *data,Mat *A)
1717: {
1719:   PetscMPIInt    size;

  /* a sequential matrix is only valid on a single-rank communicator */
1722:   MPI_Comm_size(comm,&size);
1723:   if (size > 1) SETERRQ1(comm,PETSC_ERR_ARG_WRONG,"Invalid communicator size %d",size);
  /* create, size (local == global for a Seq matrix), type, then hand over the
     optional user-provided device buffer via the preallocation routine */
1724:   MatCreate(comm,A);
1725:   MatSetSizes(*A,m,n,m,n);
1726:   MatSetType(*A,MATSEQDENSECUDA);
1727:   MatSeqDenseCUDASetPreallocation(*A,data);
1728:   return(0);
1729: }

1731: /*MC
1732:    MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.

1734:    Options Database Keys:
1735: . -mat_type seqdensecuda - sets the matrix type to "seqdensecuda" during a call to MatSetFromOptions()

1737:   Level: beginner
1738: M*/
/* Type constructor registered for MATSEQDENSECUDA: build a plain SeqDense
   matrix first, then convert it in place, which installs the CUDA methods,
   ops table, and device-side structure. */
1739: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1740: {

  /* fail early (and cleanly) if no usable CUDA device is available */
1744:   PetscCUDAInitializeCheck();
1745:   MatCreate_SeqDense(B);
1746:   MatConvert_SeqDense_SeqDenseCUDA(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);
1747:   return(0);
1748: }