Actual source code: densecuda.cu
1: /*
2: Defines the matrix operations for sequential dense with CUDA
3: */
4: #include <petscpkg_version.h>
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
6: #include <../src/mat/impls/dense/seq/dense.h>
7: #include <petsc/private/cudavecimpl.h>
8: #include <thrust/device_ptr.h>
9: #include <thrust/functional.h>
10: #include <thrust/iterator/counting_iterator.h>
11: #include <thrust/iterator/transform_iterator.h>
12: #include <thrust/iterator/permutation_iterator.h>
13: #include <thrust/transform.h>
14: #include <thrust/device_vector.h>
16: #if defined(PETSC_USE_COMPLEX)
17: #if defined(PETSC_USE_REAL_SINGLE)
18: #define cusolverDnXpotrf(a, b, c, d, e, f, g, h) cusolverDnCpotrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
19: #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f) cusolverDnCpotrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
20: #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i) cusolverDnCpotrs((a), (b), (c), (d), (cuComplex *)(e), (f), (cuComplex *)(g), (h), (i))
21: #define cusolverDnXpotri(a, b, c, d, e, f, g, h) cusolverDnCpotri((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
22: #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f) cusolverDnCpotri_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
23: #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i) cusolverDnCsytrf((a), (b), (c), (cuComplex *)(d), (e), (f), (cuComplex *)(g), (h), (i))
24: #define cusolverDnXsytrf_bufferSize(a, b, c, d, e) cusolverDnCsytrf_bufferSize((a), (b), (cuComplex *)(c), (d), (e))
25: #define cusolverDnXgetrf(a, b, c, d, e, f, g, h) cusolverDnCgetrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (g), (h))
26: #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f) cusolverDnCgetrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
27: #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j) cusolverDnCgetrs((a), (b), (c), (d), (cuComplex *)(e), (f), (g), (cuComplex *)(h), (i), (j))
28: #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f) cusolverDnCgeqrf_bufferSize((a), (b), (c), (cuComplex *)(d), (e), (f))
29: #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i) cusolverDnCgeqrf((a), (b), (c), (cuComplex *)(d), (e), (cuComplex *)(f), (cuComplex *)(g), (h), (i))
30: #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnCunmqr_bufferSize((a), (b), (c), (d), (e), (f), (cuComplex *)(g), (h), (cuComplex *)(i), (cuComplex *)(j), (k), (l))
31: #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusolverDnCunmqr((a), (b), (c), (d), (e), (f), (cuComplex *)(g), (h), (cuComplex *)(i), (cuComplex *)(j), (k), (cuComplex *)(l), (m), (n))
32: #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) cublasCtrsm((a), (b), (c), (d), (e), (f), (g), (cuComplex *)(h), (cuComplex *)(i), (j), (cuComplex *)(k), (l))
33: #else /* complex double */
34: #define cusolverDnXpotrf(a, b, c, d, e, f, g, h) cusolverDnZpotrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
35: #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f) cusolverDnZpotrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
36: #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i) cusolverDnZpotrs((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (cuDoubleComplex *)(g), (h), (i))
37: #define cusolverDnXpotri(a, b, c, d, e, f, g, h) cusolverDnZpotri((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
38: #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f) cusolverDnZpotri_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
39: #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i) cusolverDnZsytrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (f), (cuDoubleComplex *)(g), (h), (i))
40: #define cusolverDnXsytrf_bufferSize(a, b, c, d, e) cusolverDnZsytrf_bufferSize((a), (b), (cuDoubleComplex *)(c), (d), (e))
41: #define cusolverDnXgetrf(a, b, c, d, e, f, g, h) cusolverDnZgetrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (g), (h))
42: #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f) cusolverDnZgetrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
43: #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j) cusolverDnZgetrs((a), (b), (c), (d), (cuDoubleComplex *)(e), (f), (g), (cuDoubleComplex *)(h), (i), (j))
44: #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f) cusolverDnZgeqrf_bufferSize((a), (b), (c), (cuDoubleComplex *)(d), (e), (f))
45: #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i) cusolverDnZgeqrf((a), (b), (c), (cuDoubleComplex *)(d), (e), (cuDoubleComplex *)(f), (cuDoubleComplex *)(g), (h), (i))
46: #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnZunmqr_bufferSize((a), (b), (c), (d), (e), (f), (cuDoubleComplex *)(g), (h), (cuDoubleComplex *)(i), (cuDoubleComplex *)(j), (k), (l))
47: #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusolverDnZunmqr((a), (b), (c), (d), (e), (f), (cuDoubleComplex *)(g), (h), (cuDoubleComplex *)(i), (cuDoubleComplex *)(j), (k), (cuDoubleComplex *)(l), (m), (n))
48: #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) cublasZtrsm((a), (b), (c), (d), (e), (f), (g), (cuDoubleComplex *)(h), (cuDoubleComplex *)(i), (j), (cuDoubleComplex *)(k), (l))
49: #endif
50: #else /* real single */
51: #if defined(PETSC_USE_REAL_SINGLE)
52: #define cusolverDnXpotrf(a, b, c, d, e, f, g, h) cusolverDnSpotrf((a), (b), (c), (d), (e), (f), (g), (h))
53: #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f) cusolverDnSpotrf_bufferSize((a), (b), (c), (d), (e), (f))
54: #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i) cusolverDnSpotrs((a), (b), (c), (d), (e), (f), (g), (h), (i))
55: #define cusolverDnXpotri(a, b, c, d, e, f, g, h) cusolverDnSpotri((a), (b), (c), (d), (e), (f), (g), (h))
56: #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f) cusolverDnSpotri_bufferSize((a), (b), (c), (d), (e), (f))
57: #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i) cusolverDnSsytrf((a), (b), (c), (d), (e), (f), (g), (h), (i))
58: #define cusolverDnXsytrf_bufferSize(a, b, c, d, e) cusolverDnSsytrf_bufferSize((a), (b), (c), (d), (e))
59: #define cusolverDnXgetrf(a, b, c, d, e, f, g, h) cusolverDnSgetrf((a), (b), (c), (d), (e), (f), (g), (h))
60: #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f) cusolverDnSgetrf_bufferSize((a), (b), (c), (d), (e), (f))
61: #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j) cusolverDnSgetrs((a), (b), (c), (d), (e), (f), (g), (h), (i), (j))
62: #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f) cusolverDnSgeqrf_bufferSize((a), (b), (c), (float *)(d), (e), (f))
63: #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i) cusolverDnSgeqrf((a), (b), (c), (float *)(d), (e), (float *)(f), (float *)(g), (h), (i))
64: #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnSormqr_bufferSize((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (l))
65: #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusolverDnSormqr((a), (b), (c), (d), (e), (f), (float *)(g), (h), (float *)(i), (float *)(j), (k), (float *)(l), (m), (n))
66: #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) cublasStrsm((a), (b), (c), (d), (e), (f), (g), (float *)(h), (float *)(i), (j), (float *)(k), (l))
67: #else /* real double */
68: #define cusolverDnXpotrf(a, b, c, d, e, f, g, h) cusolverDnDpotrf((a), (b), (c), (d), (e), (f), (g), (h))
69: #define cusolverDnXpotrf_bufferSize(a, b, c, d, e, f) cusolverDnDpotrf_bufferSize((a), (b), (c), (d), (e), (f))
70: #define cusolverDnXpotrs(a, b, c, d, e, f, g, h, i) cusolverDnDpotrs((a), (b), (c), (d), (e), (f), (g), (h), (i))
71: #define cusolverDnXpotri(a, b, c, d, e, f, g, h) cusolverDnDpotri((a), (b), (c), (d), (e), (f), (g), (h))
72: #define cusolverDnXpotri_bufferSize(a, b, c, d, e, f) cusolverDnDpotri_bufferSize((a), (b), (c), (d), (e), (f))
73: #define cusolverDnXsytrf(a, b, c, d, e, f, g, h, i) cusolverDnDsytrf((a), (b), (c), (d), (e), (f), (g), (h), (i))
74: #define cusolverDnXsytrf_bufferSize(a, b, c, d, e) cusolverDnDsytrf_bufferSize((a), (b), (c), (d), (e))
75: #define cusolverDnXgetrf(a, b, c, d, e, f, g, h) cusolverDnDgetrf((a), (b), (c), (d), (e), (f), (g), (h))
76: #define cusolverDnXgetrf_bufferSize(a, b, c, d, e, f) cusolverDnDgetrf_bufferSize((a), (b), (c), (d), (e), (f))
77: #define cusolverDnXgetrs(a, b, c, d, e, f, g, h, i, j) cusolverDnDgetrs((a), (b), (c), (d), (e), (f), (g), (h), (i), (j))
78: #define cusolverDnXgeqrf_bufferSize(a, b, c, d, e, f) cusolverDnDgeqrf_bufferSize((a), (b), (c), (double *)(d), (e), (f))
79: #define cusolverDnXgeqrf(a, b, c, d, e, f, g, h, i) cusolverDnDgeqrf((a), (b), (c), (double *)(d), (e), (double *)(f), (double *)(g), (h), (i))
80: #define cusolverDnXormqr_bufferSize(a, b, c, d, e, f, g, h, i, j, k, l) cusolverDnDormqr_bufferSize((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (l))
81: #define cusolverDnXormqr(a, b, c, d, e, f, g, h, i, j, k, l, m, n) cusolverDnDormqr((a), (b), (c), (d), (e), (f), (double *)(g), (h), (double *)(i), (double *)(j), (k), (double *)(l), (m), (n))
82: #define cublasXtrsm(a, b, c, d, e, f, g, h, i, j, k, l) cublasDtrsm((a), (b), (c), (d), (e), (f), (g), (double *)(h), (double *)(i), (j), (double *)(k), (l))
83: #endif
84: #endif
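/* Example (illustrative): in a build with real double-precision scalars, i.e. neither
   PETSC_USE_COMPLEX nor PETSC_USE_REAL_SINGLE defined, the precision-neutral call
     cusolverDnXgetrf(handle, m, n, da, lda, work, ipiv, info)
   expands to
     cusolverDnDgetrf(handle, m, n, da, lda, work, ipiv, info)
   so the code below can be written once for all four scalar configurations. */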
86: typedef struct {
87: PetscScalar *d_v; /* pointer to the matrix on the GPU */
88: PetscBool user_alloc;
89: PetscScalar *unplacedarray; /* if one called MatCUDADensePlaceArray(), this is where it stashed the original */
90: PetscBool unplaced_user_alloc;
91: /* factorization support */
92: PetscCuBLASInt *d_fact_ipiv; /* device pivots */
93: PetscScalar *d_fact_tau; /* device QR tau vector */
94: PetscScalar *d_fact_work; /* device workspace */
95: PetscCuBLASInt fact_lwork;
96: PetscCuBLASInt *d_fact_info; /* device info */
97: /* workspace */
98: Vec workvec;
99: } Mat_SeqDenseCUDA;
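/* Note: d_v is the column-major device storage (its leading dimension is taken from the
   host Mat_SeqDense), while the d_fact_* buffers are allocated lazily by the factorization
   routines below (MatLUFactor_SeqDenseCUDA, MatCholeskyFactor_SeqDenseCUDA,
   MatQRFactor_SeqDenseCUDA) and freed in MatReset_SeqDenseCUDA. */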
101: PetscErrorCode MatSeqDenseCUDASetPreallocation(Mat A, PetscScalar *d_data)
102: {
103: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
104: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
105: PetscBool iscuda;
107: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &iscuda);
108: if (!iscuda) return 0;
109: PetscLayoutSetUp(A->rmap);
110: PetscLayoutSetUp(A->cmap);
111: /* the CPU preallocation may not have been performed yet */
112: if (cA->lda <= 0) cA->lda = A->rmap->n;
113: if (!dA->user_alloc) cudaFree(dA->d_v);
114: if (!d_data) { /* petsc-allocated storage */
115: size_t sz;
117: PetscIntMultError(cA->lda, A->cmap->n, NULL);
118: sz = cA->lda * A->cmap->n * sizeof(PetscScalar);
119: cudaMalloc((void **)&dA->d_v, sz);
120: cudaMemset(dA->d_v, 0, sz);
121: dA->user_alloc = PETSC_FALSE;
122: } else { /* user-allocated storage */
123: dA->d_v = d_data;
124: dA->user_alloc = PETSC_TRUE;
125: }
126: A->offloadmask = PETSC_OFFLOAD_GPU;
127: A->preallocated = PETSC_TRUE;
128: A->assembled = PETSC_TRUE;
129: return 0;
130: }
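/* Usage sketch (illustrative only, not part of this file): preallocating a sequential
   dense CUDA matrix. Passing NULL lets PETSc allocate and zero the device storage; passing
   a device pointer (here the hypothetical d_buf of size at least lda*n obtained from
   cudaMalloc) makes the storage user-owned, and PETSc will not free it. The sizes m, n and
   the buffer d_buf are assumed to be provided by the caller.

     Mat A;
     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, m, n, m, n);
     MatSetType(A, MATSEQDENSECUDA);
     MatSeqDenseCUDASetPreallocation(A, NULL);  // or MatSeqDenseCUDASetPreallocation(A, d_buf)
*/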
132: PetscErrorCode MatSeqDenseCUDACopyFromGPU(Mat A)
133: {
134: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
135: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
138: PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", A->offloadmask == PETSC_OFFLOAD_GPU ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
139: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
140: if (!cA->v) { /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
141: MatSeqDenseSetPreallocation(A, NULL);
142: }
143: PetscLogEventBegin(MAT_DenseCopyFromGPU, A, 0, 0, 0);
144: if (cA->lda > A->rmap->n) {
145: cudaMemcpy2D(cA->v, cA->lda * sizeof(PetscScalar), dA->d_v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyDeviceToHost);
146: } else {
147: cudaMemcpy(cA->v, dA->d_v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyDeviceToHost);
148: }
149: PetscLogGpuToCpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
150: PetscLogEventEnd(MAT_DenseCopyFromGPU, A, 0, 0, 0);
152: A->offloadmask = PETSC_OFFLOAD_BOTH;
153: }
154: return 0;
155: }
157: PetscErrorCode MatSeqDenseCUDACopyToGPU(Mat A)
158: {
159: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
160: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
161: PetscBool copy;
164: if (A->boundtocpu) return 0;
165: copy = (PetscBool)(A->offloadmask == PETSC_OFFLOAD_CPU || A->offloadmask == PETSC_OFFLOAD_UNALLOCATED);
166: PetscInfo(A, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", A->rmap->n, A->cmap->n);
167: if (copy) {
168: if (!dA->d_v) { /* Allocate GPU memory if not present */
169: MatSeqDenseCUDASetPreallocation(A, NULL);
170: }
171: PetscLogEventBegin(MAT_DenseCopyToGPU, A, 0, 0, 0);
172: if (cA->lda > A->rmap->n) {
173: cudaMemcpy2D(dA->d_v, cA->lda * sizeof(PetscScalar), cA->v, cA->lda * sizeof(PetscScalar), A->rmap->n * sizeof(PetscScalar), A->cmap->n, cudaMemcpyHostToDevice);
174: } else {
175: cudaMemcpy(dA->d_v, cA->v, cA->lda * sizeof(PetscScalar) * A->cmap->n, cudaMemcpyHostToDevice);
176: }
177: PetscLogCpuToGpu(cA->lda * sizeof(PetscScalar) * A->cmap->n);
178: PetscLogEventEnd(MAT_DenseCopyToGPU, A, 0, 0, 0);
180: A->offloadmask = PETSC_OFFLOAD_BOTH;
181: }
182: return 0;
183: }
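/* Note: A->offloadmask records where the up-to-date values live (PETSC_OFFLOAD_CPU,
   PETSC_OFFLOAD_GPU, or PETSC_OFFLOAD_BOTH after one of the two copies above). The array
   accessors below consult it, through these copy routines, to decide whether a
   host<->device transfer is needed before handing out a pointer. */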
185: static PetscErrorCode MatCopy_SeqDenseCUDA(Mat A, Mat B, MatStructure str)
186: {
187: const PetscScalar *va;
188: PetscScalar *vb;
189: PetscInt lda1, lda2, m = A->rmap->n, n = A->cmap->n;
191: /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
192: if (A->ops->copy != B->ops->copy) {
193: MatCopy_Basic(A, B, str);
194: return 0;
195: }
197: MatDenseCUDAGetArrayRead(A, &va);
198: MatDenseCUDAGetArrayWrite(B, &vb);
199: MatDenseGetLDA(A, &lda1);
200: MatDenseGetLDA(B, &lda2);
201: PetscLogGpuTimeBegin();
202: if (lda1 > m || lda2 > m) {
203: cudaMemcpy2D(vb, lda2 * sizeof(PetscScalar), va, lda1 * sizeof(PetscScalar), m * sizeof(PetscScalar), n, cudaMemcpyDeviceToDevice);
204: } else {
205: cudaMemcpy(vb, va, m * (n * sizeof(PetscScalar)), cudaMemcpyDeviceToDevice);
206: }
207: PetscLogGpuTimeEnd();
208: MatDenseCUDARestoreArrayWrite(B, &vb);
209: MatDenseCUDARestoreArrayRead(A, &va);
210: return 0;
211: }
213: static PetscErrorCode MatZeroEntries_SeqDenseCUDA(Mat A)
214: {
215: PetscScalar *va;
216: PetscInt lda, m = A->rmap->n, n = A->cmap->n;
218: MatDenseCUDAGetArrayWrite(A, &va);
219: MatDenseGetLDA(A, &lda);
220: PetscLogGpuTimeBegin();
221: if (lda > m) {
222: cudaMemset2D(va, lda * sizeof(PetscScalar), 0, m * sizeof(PetscScalar), n);
223: } else {
224: cudaMemset(va, 0, m * (n * sizeof(PetscScalar)));
225: }
226: PetscLogGpuTimeEnd();
227: MatDenseCUDARestoreArrayWrite(A, &va);
228: return 0;
229: }
231: static PetscErrorCode MatDenseCUDAPlaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
232: {
233: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
234: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
239: if (aa->v) MatSeqDenseCUDACopyToGPU(A);
240: dA->unplacedarray = dA->d_v;
241: dA->unplaced_user_alloc = dA->user_alloc;
242: dA->d_v = (PetscScalar *)a;
243: dA->user_alloc = PETSC_TRUE;
244: return 0;
245: }
247: static PetscErrorCode MatDenseCUDAResetArray_SeqDenseCUDA(Mat A)
248: {
249: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
250: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
254: if (a->v) MatSeqDenseCUDACopyToGPU(A);
255: dA->d_v = dA->unplacedarray;
256: dA->user_alloc = dA->unplaced_user_alloc;
257: dA->unplacedarray = NULL;
258: return 0;
259: }
261: static PetscErrorCode MatDenseCUDAReplaceArray_SeqDenseCUDA(Mat A, const PetscScalar *a)
262: {
263: Mat_SeqDense *aa = (Mat_SeqDense *)A->data;
264: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
269: if (!dA->user_alloc) cudaFree(dA->d_v);
270: dA->d_v = (PetscScalar *)a;
271: dA->user_alloc = PETSC_FALSE;
272: return 0;
273: }
275: static PetscErrorCode MatDenseCUDAGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
276: {
277: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
279: if (!dA->d_v) MatSeqDenseCUDASetPreallocation(A, NULL);
280: *a = dA->d_v;
281: return 0;
282: }
284: static PetscErrorCode MatDenseCUDARestoreArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **a)
285: {
286: if (a) *a = NULL;
287: return 0;
288: }
290: static PetscErrorCode MatDenseCUDAGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
291: {
292: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
294: MatSeqDenseCUDACopyToGPU(A);
295: *a = dA->d_v;
296: return 0;
297: }
299: static PetscErrorCode MatDenseCUDARestoreArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **a)
300: {
301: if (a) *a = NULL;
302: return 0;
303: }
305: static PetscErrorCode MatDenseCUDAGetArray_SeqDenseCUDA(Mat A, PetscScalar **a)
306: {
307: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
309: MatSeqDenseCUDACopyToGPU(A);
310: *a = dA->d_v;
311: return 0;
312: }
314: static PetscErrorCode MatDenseCUDARestoreArray_SeqDenseCUDA(Mat A, PetscScalar **a)
315: {
316: if (a) *a = NULL;
317: return 0;
318: }
320: PETSC_EXTERN PetscErrorCode MatSeqDenseCUDAInvertFactors_Private(Mat A)
321: {
322: #if PETSC_PKG_CUDA_VERSION_GE(10, 1, 0)
323: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
324: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
325: PetscScalar *da;
326: cusolverDnHandle_t handle;
327: PetscCuBLASInt n, lda;
328: #if defined(PETSC_USE_DEBUG)
329: PetscCuBLASInt info;
330: #endif
332: if (!A->rmap->n || !A->cmap->n) return 0;
333: PetscCUSOLVERDnGetHandle(&handle);
334: PetscCuBLASIntCast(A->cmap->n, &n);
335: PetscCuBLASIntCast(a->lda, &lda);
337: if (A->factortype == MAT_FACTOR_CHOLESKY) {
338: if (!dA->d_fact_ipiv) { /* spd */
339: PetscCuBLASInt il;
341: MatDenseCUDAGetArray(A, &da);
342: cusolverDnXpotri_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &il);
343: if (il > dA->fact_lwork) {
344: dA->fact_lwork = il;
346: cudaFree(dA->d_fact_work);
347: cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
348: }
349: PetscLogGpuTimeBegin();
350: cusolverDnXpotri(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
351: PetscLogGpuTimeEnd();
352: MatDenseCUDARestoreArray(A, &da);
353: /* TODO (write cuda kernel) */
354: MatSeqDenseSymmetrize_Private(A, PETSC_TRUE);
355: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytri not implemented");
356: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Not implemented");
357: #if defined(PETSC_USE_DEBUG)
358: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
361: #endif
362: PetscLogGpuFlops(1.0 * n * n * n / 3.0);
363: A->ops->solve = NULL;
364: A->ops->solvetranspose = NULL;
365: A->ops->matsolve = NULL;
366: A->factortype = MAT_FACTOR_NONE;
368: PetscFree(A->solvertype);
369: return 0;
370: #else
371: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
372: #endif
373: }
375: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal(Mat A, Vec xx, Vec yy, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
376: {
377: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
378: PetscScalar *y;
379: PetscCuBLASInt m = 0, k = 0;
380: PetscBool xiscuda, yiscuda, aiscuda;
383: PetscCuBLASIntCast(A->rmap->n, &m);
384: PetscCuBLASIntCast(A->cmap->n, &k);
385: PetscObjectTypeCompare((PetscObject)xx, VECSEQCUDA, &xiscuda);
386: PetscObjectTypeCompare((PetscObject)yy, VECSEQCUDA, &yiscuda);
387: {
388: const PetscScalar *x;
389: PetscBool xishost = PETSC_TRUE;
391: /* The logic here is to minimize the amount of memory copying:
392: if xiscuda but the data has not been offloaded to the GPU yet, calling
393: VecCUDAGetArrayRead(xx,&x) would first copy the data to the GPU.
394: Since we only need the data in order to copy it into the y array,
395: we instead take x from wherever the data already resides, so that
396: only one memcpy is performed */
397: if (xiscuda && xx->offloadmask & PETSC_OFFLOAD_GPU) {
398: VecCUDAGetArrayRead(xx, &x);
399: xishost = PETSC_FALSE;
400: } else {
401: VecGetArrayRead(xx, &x);
402: }
403: if (k < m || !yiscuda) {
404: if (!dA->workvec) VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
405: VecCUDAGetArrayWrite(dA->workvec, &y);
406: } else {
407: VecCUDAGetArrayWrite(yy, &y);
408: }
409: cudaMemcpy(y, x, m * sizeof(PetscScalar), xishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);
410: }
411: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda);
412: if (!aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
413: (*matsolve)(A, y, m, m, 1, k, transpose);
414: if (!aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
415: if (k < m || !yiscuda) {
416: PetscScalar *yv;
418: /* The logic here is that the data is not yet in either yy's GPU array or its
419: CPU array. There is nothing in the interface to say where the user would like
420: it to end up. So we choose the GPU, because it is the faster option */
421: if (yiscuda) {
422: VecCUDAGetArrayWrite(yy, &yv);
423: } else {
424: VecGetArray(yy, &yv);
425: }
426: cudaMemcpy(yv, y, k * sizeof(PetscScalar), yiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
427: if (yiscuda) {
428: VecCUDARestoreArrayWrite(yy, &yv);
429: } else {
430: VecRestoreArray(yy, &yv);
431: }
432: VecCUDARestoreArrayWrite(dA->workvec, &y);
433: } else {
434: VecCUDARestoreArrayWrite(yy, &y);
435: }
436: return 0;
437: }
439: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Internal(Mat A, Mat B, Mat X, PetscBool transpose, PetscErrorCode (*matsolve)(Mat, PetscScalar *, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscCuBLASInt, PetscBool))
440: {
441: PetscScalar *y;
442: PetscInt n, _ldb, _ldx;
443: PetscBool biscuda, xiscuda, aiscuda;
444: PetscCuBLASInt nrhs = 0, m = 0, k = 0, ldb = 0, ldx = 0, ldy = 0;
447: PetscCuBLASIntCast(A->rmap->n, &m);
448: PetscCuBLASIntCast(A->cmap->n, &k);
449: MatGetSize(B, NULL, &n);
450: PetscCuBLASIntCast(n, &nrhs);
451: MatDenseGetLDA(B, &_ldb);
452: PetscCuBLASIntCast(_ldb, &ldb);
453: MatDenseGetLDA(X, &_ldx);
454: PetscCuBLASIntCast(_ldx, &ldx);
456: PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda);
457: PetscObjectTypeCompare((PetscObject)X, MATSEQDENSECUDA, &xiscuda);
458: {
459: /* The logic here is to minimize the amount of memory copying:
460: if biscuda but the data has not been offloaded to the GPU yet, calling
461: MatDenseCUDAGetArrayRead(B,&b) would first copy the data to the GPU.
462: Since we only need the data in order to copy it into the y array,
463: we instead take b from wherever the data already resides, so that
464: only one memcpy is performed */
465: const PetscScalar *b;
467: /* some copying from B will be involved */
468: PetscBool bishost = PETSC_TRUE;
470: if (biscuda && B->offloadmask & PETSC_OFFLOAD_GPU) {
471: MatDenseCUDAGetArrayRead(B, &b);
472: bishost = PETSC_FALSE;
473: } else {
474: MatDenseGetArrayRead(B, &b);
475: }
476: if (ldx < m || !xiscuda) {
477: /* X's array cannot serve as the array (too small or not on device), B's
478: * array cannot serve as the array (const), so allocate a new array */
479: ldy = m;
480: cudaMalloc((void **)&y, nrhs * m * sizeof(PetscScalar));
481: } else {
482: /* X's array should serve as the array */
483: ldy = ldx;
484: MatDenseCUDAGetArrayWrite(X, &y);
485: }
486: cudaMemcpy2D(y, ldy * sizeof(PetscScalar), b, ldb * sizeof(PetscScalar), m * sizeof(PetscScalar), nrhs, bishost ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice);
487: if (bishost) {
488: MatDenseRestoreArrayRead(B, &b);
489: } else {
490: MatDenseCUDARestoreArrayRead(B, &b);
491: }
492: }
493: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &aiscuda);
494: if (!aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
495: (*matsolve)(A, y, ldy, m, nrhs, k, transpose);
496: if (!aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
497: if (ldx < m || !xiscuda) {
498: PetscScalar *x;
500: /* The logic here is that the data is not yet in either X's GPU array or its
501: CPU array. There is nothing in the interface to say where the user would like
502: it to end up. So we choose the GPU, because it is the faster option */
503: if (xiscuda) {
504: MatDenseCUDAGetArrayWrite(X, &x);
505: } else {
506: MatDenseGetArray(X, &x);
507: }
508: cudaMemcpy2D(x, ldx * sizeof(PetscScalar), y, ldy * sizeof(PetscScalar), k * sizeof(PetscScalar), nrhs, xiscuda ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);
509: if (xiscuda) {
510: MatDenseCUDARestoreArrayWrite(X, &x);
511: } else {
512: MatDenseRestoreArray(X, &x);
513: }
514: cudaFree(y);
515: } else {
516: MatDenseCUDARestoreArrayWrite(X, &y);
517: }
518: return 0;
519: }
521: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_LU(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
522: {
523: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
524: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
525: const PetscScalar *da;
526: PetscCuBLASInt lda;
527: cusolverDnHandle_t handle;
528: int info;
530: MatDenseCUDAGetArrayRead(A, &da);
531: PetscCuBLASIntCast(mat->lda, &lda);
532: PetscCUSOLVERDnGetHandle(&handle);
533: PetscLogGpuTimeBegin();
534: PetscInfo(A, "LU solve %d x %d on backend\n", m, k);
535: cusolverDnXgetrs(handle, T ? CUBLAS_OP_T : CUBLAS_OP_N, m, nrhs, da, lda, dA->d_fact_ipiv, x, ldx, dA->d_fact_info);
536: PetscLogGpuTimeEnd();
537: MatDenseCUDARestoreArrayRead(A, &da);
538: if (PetscDefined(USE_DEBUG)) {
539: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
542: }
543: PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
544: return 0;
545: }
547: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_Cholesky(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
548: {
549: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
550: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
551: const PetscScalar *da;
552: PetscCuBLASInt lda;
553: cusolverDnHandle_t handle;
554: int info;
556: MatDenseCUDAGetArrayRead(A, &da);
557: PetscCuBLASIntCast(mat->lda, &lda);
558: PetscCUSOLVERDnGetHandle(&handle);
559: PetscLogGpuTimeBegin();
560: PetscInfo(A, "Cholesky solve %d x %d on backend\n", m, k);
561: if (!dA->d_fact_ipiv) { /* spd */
562: /* ========= Program hit cudaErrorNotReady (error 34) due to "device not ready" on CUDA API call to cudaEventQuery. */
563: cusolverDnXpotrs(handle, CUBLAS_FILL_MODE_LOWER, m, nrhs, da, lda, x, ldx, dA->d_fact_info);
564: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "cusolverDnsytrs not implemented");
565: PetscLogGpuTimeEnd();
566: MatDenseCUDARestoreArrayRead(A, &da);
567: if (PetscDefined(USE_DEBUG)) {
568: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
571: }
572: PetscLogGpuFlops(nrhs * (2.0 * m * m - m));
573: return 0;
574: }
576: static PetscErrorCode MatSolve_SeqDenseCUDA_Internal_QR(Mat A, PetscScalar *x, PetscCuBLASInt ldx, PetscCuBLASInt m, PetscCuBLASInt nrhs, PetscCuBLASInt k, PetscBool T)
577: {
578: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
579: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
580: const PetscScalar *da;
581: PetscCuBLASInt lda, rank;
582: cusolverDnHandle_t handle;
583: cublasHandle_t bhandle;
584: int info;
585: cublasOperation_t trans;
586: PetscScalar one = 1.;
588: PetscCuBLASIntCast(mat->rank, &rank);
589: MatDenseCUDAGetArrayRead(A, &da);
590: PetscCuBLASIntCast(mat->lda, &lda);
591: PetscCUSOLVERDnGetHandle(&handle);
592: PetscCUBLASGetHandle(&bhandle);
593: PetscLogGpuTimeBegin();
594: PetscInfo(A, "QR solve %d x %d on backend\n", m, k);
595: if (!T) {
596: if (PetscDefined(USE_COMPLEX)) {
597: trans = CUBLAS_OP_C;
598: } else {
599: trans = CUBLAS_OP_T;
600: }
601: cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, trans, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
602: if (PetscDefined(USE_DEBUG)) {
603: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
605: }
606: cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
607: } else {
608: cublasXtrsm(bhandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da, lda, x, ldx);
609: cusolverDnXormqr(handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, nrhs, rank, da, lda, dA->d_fact_tau, x, ldx, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
610: if (PetscDefined(USE_DEBUG)) {
611: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
613: }
614: }
615: PetscLogGpuTimeEnd();
616: MatDenseCUDARestoreArrayRead(A, &da);
617: PetscLogGpuFlops(nrhs * (4.0 * m * mat->rank - PetscSqr(mat->rank)));
618: return 0;
619: }
621: static PetscErrorCode MatSolve_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
622: {
623: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
624: return 0;
625: }
627: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_LU(Mat A, Vec xx, Vec yy)
628: {
629: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
630: return 0;
631: }
633: static PetscErrorCode MatSolve_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
634: {
635: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
636: return 0;
637: }
639: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Vec xx, Vec yy)
640: {
641: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
642: return 0;
643: }
645: static PetscErrorCode MatSolve_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
646: {
647: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
648: return 0;
649: }
651: static PetscErrorCode MatSolveTranspose_SeqDenseCUDA_QR(Mat A, Vec xx, Vec yy)
652: {
653: MatSolve_SeqDenseCUDA_Internal(A, xx, yy, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
654: return 0;
655: }
657: static PetscErrorCode MatMatSolve_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
658: {
659: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_LU);
660: return 0;
661: }
663: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_LU(Mat A, Mat B, Mat X)
664: {
665: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_LU);
666: return 0;
667: }
669: static PetscErrorCode MatMatSolve_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
670: {
671: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
672: return 0;
673: }
675: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_Cholesky(Mat A, Mat B, Mat X)
676: {
677: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_Cholesky);
678: return 0;
679: }
681: static PetscErrorCode MatMatSolve_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
682: {
683: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_FALSE, MatSolve_SeqDenseCUDA_Internal_QR);
684: return 0;
685: }
687: static PetscErrorCode MatMatSolveTranspose_SeqDenseCUDA_QR(Mat A, Mat B, Mat X)
688: {
689: MatMatSolve_SeqDenseCUDA_Internal(A, B, X, PETSC_TRUE, MatSolve_SeqDenseCUDA_Internal_QR);
690: return 0;
691: }
693: static PetscErrorCode MatLUFactor_SeqDenseCUDA(Mat A, IS rperm, IS cperm, const MatFactorInfo *factinfo)
694: {
695: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
696: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
697: PetscScalar *da;
698: PetscCuBLASInt m, n, lda;
699: #if defined(PETSC_USE_DEBUG)
700: int info;
701: #endif
702: cusolverDnHandle_t handle;
704: if (!A->rmap->n || !A->cmap->n) return 0;
705: PetscCUSOLVERDnGetHandle(&handle);
706: MatDenseCUDAGetArray(A, &da);
707: PetscCuBLASIntCast(A->cmap->n, &n);
708: PetscCuBLASIntCast(A->rmap->n, &m);
709: PetscCuBLASIntCast(a->lda, &lda);
710: PetscInfo(A, "LU factor %d x %d on backend\n", m, n);
711: if (!dA->d_fact_ipiv) cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
712: if (!dA->fact_lwork) {
713: cusolverDnXgetrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
714: cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
715: }
716: if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
717: PetscLogGpuTimeBegin();
718: cusolverDnXgetrf(handle, m, n, da, lda, dA->d_fact_work, dA->d_fact_ipiv, dA->d_fact_info);
719: PetscLogGpuTimeEnd();
720: MatDenseCUDARestoreArray(A, &da);
721: #if defined(PETSC_USE_DEBUG)
722: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
725: #endif
726: A->factortype = MAT_FACTOR_LU;
727: PetscLogGpuFlops(2.0 * n * n * m / 3.0);
729: A->ops->solve = MatSolve_SeqDenseCUDA_LU;
730: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_LU;
731: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_LU;
732: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_LU;
734: PetscFree(A->solvertype);
735: PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
736: return 0;
737: }
739: static PetscErrorCode MatCholeskyFactor_SeqDenseCUDA(Mat A, IS perm, const MatFactorInfo *factinfo)
740: {
741: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
742: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
743: PetscScalar *da;
744: PetscCuBLASInt n, lda;
745: #if defined(PETSC_USE_DEBUG)
746: int info;
747: #endif
748: cusolverDnHandle_t handle;
750: if (!A->rmap->n || !A->cmap->n) return 0;
751: PetscCUSOLVERDnGetHandle(&handle);
752: PetscCuBLASIntCast(A->rmap->n, &n);
753: PetscInfo(A, "Cholesky factor %d x %d on backend\n", n, n);
754: if (A->spd == PETSC_BOOL3_TRUE) {
755: MatDenseCUDAGetArray(A, &da);
756: PetscCuBLASIntCast(a->lda, &lda);
757: if (!dA->fact_lwork) {
758: cusolverDnXpotrf_bufferSize(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, &dA->fact_lwork);
759: cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
760: }
761: if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
762: PetscLogGpuTimeBegin();
763: cusolverDnXpotrf(handle, CUBLAS_FILL_MODE_LOWER, n, da, lda, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
764: PetscLogGpuTimeEnd();
766: MatDenseCUDARestoreArray(A, &da);
767: #if defined(PETSC_USE_DEBUG)
768: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
771: #endif
772: A->factortype = MAT_FACTOR_CHOLESKY;
773: PetscLogGpuFlops(1.0 * n * n * n / 3.0);
774: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cusolverDnsytrs unavailable. Use MAT_FACTOR_LU");
775: #if 0
776: /* At the time of writing this interface (CUDA 10.0), cusolverDn does not implement the *sytrs and *hetr* routines.
777: The code below should work, and it can be activated once the *sytrs routines become available */
778: if (!dA->d_fact_ipiv) {
779: cudaMalloc((void**)&dA->d_fact_ipiv,n*sizeof(*dA->d_fact_ipiv));
780: }
781: if (!dA->fact_lwork) {
782: cusolverDnXsytrf_bufferSize(handle,n,da,lda,&dA->fact_lwork);
783: cudaMalloc((void**)&dA->d_fact_work,dA->fact_lwork*sizeof(*dA->d_fact_work));
784: }
785: if (!dA->d_fact_info) {
786: cudaMalloc((void**)&dA->d_fact_info,sizeof(*dA->d_fact_info));
787: }
788: PetscLogGpuTimeBegin();
789: cusolverDnXsytrf(handle,CUBLAS_FILL_MODE_LOWER,n,da,lda,dA->d_fact_ipiv,dA->d_fact_work,dA->fact_lwork,dA->d_fact_info);
790: PetscLogGpuTimeEnd();
791: #endif
793: A->ops->solve = MatSolve_SeqDenseCUDA_Cholesky;
794: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_Cholesky;
795: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_Cholesky;
796: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_Cholesky;
797: PetscFree(A->solvertype);
798: PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
799: return 0;
800: }
802: static PetscErrorCode MatQRFactor_SeqDenseCUDA(Mat A, IS col, const MatFactorInfo *factinfo)
803: {
804: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
805: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
806: PetscScalar *da;
807: PetscCuBLASInt m, min, max, n, lda;
808: #if defined(PETSC_USE_DEBUG)
809: int info;
810: #endif
811: cusolverDnHandle_t handle;
813: if (!A->rmap->n || !A->cmap->n) return 0;
814: PetscCUSOLVERDnGetHandle(&handle);
815: MatDenseCUDAGetArray(A, &da);
816: PetscCuBLASIntCast(A->cmap->n, &n);
817: PetscCuBLASIntCast(A->rmap->n, &m);
818: PetscCuBLASIntCast(a->lda, &lda);
819: PetscInfo(A, "QR factor %d x %d on backend\n", m, n);
820: max = PetscMax(m, n);
821: min = PetscMin(m, n);
822: if (!dA->d_fact_tau) cudaMalloc((void **)&dA->d_fact_tau, min * sizeof(*dA->d_fact_tau));
823: if (!dA->d_fact_ipiv) cudaMalloc((void **)&dA->d_fact_ipiv, n * sizeof(*dA->d_fact_ipiv));
824: if (!dA->fact_lwork) {
825: cusolverDnXgeqrf_bufferSize(handle, m, n, da, lda, &dA->fact_lwork);
826: cudaMalloc((void **)&dA->d_fact_work, dA->fact_lwork * sizeof(*dA->d_fact_work));
827: }
828: if (!dA->d_fact_info) cudaMalloc((void **)&dA->d_fact_info, sizeof(*dA->d_fact_info));
829: if (!dA->workvec) VecCreateSeqCUDA(PetscObjectComm((PetscObject)A), m, &(dA->workvec));
830: PetscLogGpuTimeBegin();
831: cusolverDnXgeqrf(handle, m, n, da, lda, dA->d_fact_tau, dA->d_fact_work, dA->fact_lwork, dA->d_fact_info);
832: PetscLogGpuTimeEnd();
833: MatDenseCUDARestoreArray(A, &da);
834: #if defined(PETSC_USE_DEBUG)
835: cudaMemcpy(&info, dA->d_fact_info, sizeof(PetscCuBLASInt), cudaMemcpyDeviceToHost);
837: #endif
838: A->factortype = MAT_FACTOR_QR;
839: a->rank = min;
840: PetscLogGpuFlops(2.0 * min * min * (max - min / 3.0));
842: A->ops->solve = MatSolve_SeqDenseCUDA_QR;
843: A->ops->solvetranspose = MatSolveTranspose_SeqDenseCUDA_QR;
844: A->ops->matsolve = MatMatSolve_SeqDenseCUDA_QR;
845: A->ops->matsolvetranspose = MatMatSolveTranspose_SeqDenseCUDA_QR;
847: PetscFree(A->solvertype);
848: PetscStrallocpy(MATSOLVERCUDA, &A->solvertype);
849: return 0;
850: }
852: /* GEMM kernel: C = op(A)*op(B), tA, tB flag transposition */
853: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat A, Mat B, Mat C, PetscBool tA, PetscBool tB)
854: {
855: const PetscScalar *da, *db;
856: PetscScalar *dc;
857: PetscScalar one = 1.0, zero = 0.0;
858: PetscCuBLASInt m, n, k;
859: PetscInt alda, blda, clda;
860: cublasHandle_t cublasv2handle;
861: PetscBool Aiscuda, Biscuda;
864: /* we may end up with SEQDENSE as one of the arguments */
865: PetscObjectTypeCompare((PetscObject)A, MATSEQDENSECUDA, &Aiscuda);
866: PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &Biscuda);
867: if (!Aiscuda) MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);
868: if (!Biscuda) MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
869: PetscCuBLASIntCast(C->rmap->n, &m);
870: PetscCuBLASIntCast(C->cmap->n, &n);
871: if (tA) PetscCuBLASIntCast(A->rmap->n, &k);
872: else PetscCuBLASIntCast(A->cmap->n, &k);
873: if (!m || !n || !k) return 0;
874: PetscInfo(C, "Matrix-Matrix product %d x %d x %d on backend\n", m, k, n);
875: MatDenseCUDAGetArrayRead(A, &da);
876: MatDenseCUDAGetArrayRead(B, &db);
877: MatDenseCUDAGetArrayWrite(C, &dc);
878: MatDenseGetLDA(A, &alda);
879: MatDenseGetLDA(B, &blda);
880: MatDenseGetLDA(C, &clda);
881: PetscCUBLASGetHandle(&cublasv2handle);
882: PetscLogGpuTimeBegin();
883: cublasXgemm(cublasv2handle, tA ? CUBLAS_OP_T : CUBLAS_OP_N, tB ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, k, &one, da, alda, db, blda, &zero, dc, clda);
885: PetscLogGpuTimeEnd();
886: PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1));
887: MatDenseCUDARestoreArrayRead(A, &da);
888: MatDenseCUDARestoreArrayRead(B, &db);
889: MatDenseCUDARestoreArrayWrite(C, &dc);
890: if (!Aiscuda) MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
891: if (!Biscuda) MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B);
892: return 0;
893: }
895: PetscErrorCode MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
896: {
897: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_TRUE, PETSC_FALSE);
898: return 0;
899: }
901: PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
902: {
903: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_FALSE);
904: return 0;
905: }
907: PetscErrorCode MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA(Mat A, Mat B, Mat C)
908: {
909: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(A, B, C, PETSC_FALSE, PETSC_TRUE);
910: return 0;
911: }
913: PetscErrorCode MatProductSetFromOptions_SeqDenseCUDA(Mat C)
914: {
915: MatProductSetFromOptions_SeqDense(C);
916: return 0;
917: }
919: /* zz = op(A)*xx + yy
920: if yy == NULL, only MatMult */
921: static PetscErrorCode MatMultAdd_SeqDenseCUDA_Private(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans)
922: {
923: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
924: const PetscScalar *xarray, *da;
925: PetscScalar *zarray;
926: PetscScalar one = 1.0, zero = 0.0;
927: PetscCuBLASInt m, n, lda;
928: cublasHandle_t cublasv2handle;
931: /* mult add */
932: if (yy && yy != zz) VecCopy_SeqCUDA(yy, zz);
933: if (!A->rmap->n || !A->cmap->n) {
934: /* mult only */
935: if (!yy) VecSet_SeqCUDA(zz, 0.0);
936: return 0;
937: }
938: PetscInfo(A, "Matrix-vector product %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", A->rmap->n, A->cmap->n);
939: PetscCuBLASIntCast(A->rmap->n, &m);
940: PetscCuBLASIntCast(A->cmap->n, &n);
941: PetscCUBLASGetHandle(&cublasv2handle);
942: MatDenseCUDAGetArrayRead(A, &da);
943: PetscCuBLASIntCast(mat->lda, &lda);
944: VecCUDAGetArrayRead(xx, &xarray);
945: VecCUDAGetArray(zz, &zarray);
946: PetscLogGpuTimeBegin();
947: cublasXgemv(cublasv2handle, trans ? CUBLAS_OP_T : CUBLAS_OP_N, m, n, &one, da, lda, xarray, 1, (yy ? &one : &zero), zarray, 1);
949: PetscLogGpuTimeEnd();
950: PetscLogGpuFlops(2.0 * A->rmap->n * A->cmap->n - (yy ? 0 : A->rmap->n));
951: VecCUDARestoreArrayRead(xx, &xarray);
952: VecCUDARestoreArray(zz, &zarray);
953: MatDenseCUDARestoreArrayRead(A, &da);
954: return 0;
955: }
957: PetscErrorCode MatMultAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
958: {
959: MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_FALSE);
960: return 0;
961: }
963: PetscErrorCode MatMultTransposeAdd_SeqDenseCUDA(Mat A, Vec xx, Vec yy, Vec zz)
964: {
965: MatMultAdd_SeqDenseCUDA_Private(A, xx, yy, zz, PETSC_TRUE);
966: return 0;
967: }
969: PetscErrorCode MatMult_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
970: {
971: MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_FALSE);
972: return 0;
973: }
975: PetscErrorCode MatMultTranspose_SeqDenseCUDA(Mat A, Vec xx, Vec yy)
976: {
977: MatMultAdd_SeqDenseCUDA_Private(A, xx, NULL, yy, PETSC_TRUE);
978: return 0;
979: }
981: static PetscErrorCode MatDenseGetArrayRead_SeqDenseCUDA(Mat A, const PetscScalar **array)
982: {
983: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
985: MatSeqDenseCUDACopyFromGPU(A);
986: *array = mat->v;
987: return 0;
988: }
990: static PetscErrorCode MatDenseGetArrayWrite_SeqDenseCUDA(Mat A, PetscScalar **array)
991: {
992: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
994: /* MatCreateSeqDenseCUDA may not allocate CPU memory. Allocate if needed */
995: if (!mat->v) MatSeqDenseSetPreallocation(A, NULL);
996: *array = mat->v;
997: A->offloadmask = PETSC_OFFLOAD_CPU;
998: return 0;
999: }
1001: static PetscErrorCode MatDenseGetArray_SeqDenseCUDA(Mat A, PetscScalar **array)
1002: {
1003: Mat_SeqDense *mat = (Mat_SeqDense *)A->data;
1005: MatSeqDenseCUDACopyFromGPU(A);
1006: *array = mat->v;
1007: A->offloadmask = PETSC_OFFLOAD_CPU;
1008: return 0;
1009: }
1011: PetscErrorCode MatScale_SeqDenseCUDA(Mat Y, PetscScalar alpha)
1012: {
1013: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
1014: PetscScalar *dy;
1015: PetscCuBLASInt j, N, m, lday, one = 1;
1016: cublasHandle_t cublasv2handle;
1018: PetscCUBLASGetHandle(&cublasv2handle);
1019: MatDenseCUDAGetArray(Y, &dy);
1020: PetscCuBLASIntCast(Y->rmap->n * Y->cmap->n, &N);
1021: PetscCuBLASIntCast(Y->rmap->n, &m);
1022: PetscCuBLASIntCast(y->lda, &lday);
1023: PetscInfo(Y, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
1024: PetscLogGpuTimeBegin();
1025: if (lday > m) {
1026: for (j = 0; j < Y->cmap->n; j++) cublasXscal(cublasv2handle, m, &alpha, dy + lday * j, one);
1027: } else cublasXscal(cublasv2handle, N, &alpha, dy, one);
1028: PetscLogGpuTimeEnd();
1029: PetscLogGpuFlops(N);
1030: MatDenseCUDARestoreArray(Y, &dy);
1031: return 0;
1032: }
1034: struct petscshift : public thrust::unary_function<PetscScalar, PetscScalar> {
1035: const PetscScalar shift_;
1036: petscshift(PetscScalar shift) : shift_(shift) { }
1037: __device__ PetscScalar operator()(PetscScalar x) { return x + shift_; }
1038: };
1040: template <typename Iterator>
1041: class strided_range {
1042: public:
1043: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
1044: struct stride_functor : public thrust::unary_function<difference_type, difference_type> {
1045: difference_type stride;
1046: stride_functor(difference_type stride) : stride(stride) { }
1047: __device__ difference_type operator()(const difference_type &i) const { return stride * i; }
1048: };
1049: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
1050: typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
1051: typedef typename thrust::permutation_iterator<Iterator, TransformIterator> PermutationIterator;
1052: typedef PermutationIterator iterator; // type of the strided_range iterator
1053: // construct strided_range for the range [first,last)
1054: strided_range(Iterator first, Iterator last, difference_type stride) : first(first), last(last), stride(stride) { }
1055: iterator begin(void) const { return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); }
1056: iterator end(void) const { return begin() + ((last - first) + (stride - 1)) / stride; }
1058: protected:
1059: Iterator first;
1060: Iterator last;
1061: difference_type stride;
1062: };
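/* Illustrative sketch: in a column-major array with leading dimension lda, consecutive
   diagonal entries are lda+1 apart (entry (r,r) sits at offset r + r*lda). Wrapping a
   thrust::device_ptr range with that stride, e.g.
     strided_range<thrust::device_vector<PetscScalar>::iterator> diag(dptr + begin, dptr + end, lda + 1);
   yields an iterator range over the diagonal, which MatShift_DenseCUDA_Private below
   combines with thrust::transform and petscshift to add alpha to each diagonal entry. */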
1064: PetscErrorCode MatShift_DenseCUDA_Private(PetscScalar *da, PetscScalar alpha, PetscInt lda, PetscInt rstart, PetscInt rend, PetscInt cols)
1065: {
1066: PetscInt rend2 = PetscMin(rend, cols);
1067: if (rend2 > rstart) {
1068: PetscLogGpuTimeBegin();
1069: try {
1070: const auto dptr = thrust::device_pointer_cast(da);
1071: size_t begin = rstart * lda;
1072: size_t end = rend2 - rstart + rend2 * lda;
1073: strided_range<thrust::device_vector<PetscScalar>::iterator> diagonal(dptr + begin, dptr + end, lda + 1);
1074: thrust::transform(diagonal.begin(), diagonal.end(), diagonal.begin(), petscshift(alpha));
1075: } catch (char *ex) {
1076: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "Thrust error: %s", ex);
1077: }
1078: PetscLogGpuTimeEnd();
1079: PetscLogGpuFlops(rend2 - rstart);
1080: }
1081: return 0;
1082: }
1084: PetscErrorCode MatShift_SeqDenseCUDA(Mat A, PetscScalar alpha)
1085: {
1086: PetscScalar *da;
1087: PetscInt m = A->rmap->n, n = A->cmap->n, lda;
1089: MatDenseCUDAGetArray(A, &da);
1090: MatDenseGetLDA(A, &lda);
1091: PetscInfo(A, "Performing Shift %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n);
1092: MatShift_DenseCUDA_Private(da, alpha, lda, 0, m, n);
1093: MatDenseCUDARestoreArray(A, &da);
1094: return 0;
1095: }
1097: PetscErrorCode MatAXPY_SeqDenseCUDA(Mat Y, PetscScalar alpha, Mat X, MatStructure str)
1098: {
1099: Mat_SeqDense *x = (Mat_SeqDense *)X->data;
1100: Mat_SeqDense *y = (Mat_SeqDense *)Y->data;
1101: const PetscScalar *dx;
1102: PetscScalar *dy;
1103: PetscCuBLASInt j, N, m, ldax, lday, one = 1;
1104: cublasHandle_t cublasv2handle;
1106: if (!X->rmap->n || !X->cmap->n) return 0;
1107: PetscCUBLASGetHandle(&cublasv2handle);
1108: MatDenseCUDAGetArrayRead(X, &dx);
1109: if (alpha == 0.0) MatDenseCUDAGetArrayWrite(Y, &dy);
1110: else MatDenseCUDAGetArray(Y, &dy);
1111: PetscCuBLASIntCast(X->rmap->n * X->cmap->n, &N);
1112: PetscCuBLASIntCast(X->rmap->n, &m);
1113: PetscCuBLASIntCast(x->lda, &ldax);
1114: PetscCuBLASIntCast(y->lda, &lday);
1115: PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", Y->rmap->n, Y->cmap->n);
1116: PetscLogGpuTimeBegin();
1117: if (ldax > m || lday > m) {
1118: for (j = 0; j < X->cmap->n; j++) cublasXaxpy(cublasv2handle, m, &alpha, dx + j * ldax, one, dy + j * lday, one);
1119: } else cublasXaxpy(cublasv2handle, N, &alpha, dx, one, dy, one);
1120: PetscLogGpuTimeEnd();
1121: PetscLogGpuFlops(PetscMax(2. * N - 1, 0));
1122: MatDenseCUDARestoreArrayRead(X, &dx);
1123: if (alpha == 0.0) MatDenseCUDARestoreArrayWrite(Y, &dy);
1124: else MatDenseCUDARestoreArray(Y, &dy);
1125: return 0;
1126: }
1128: static PetscErrorCode MatReset_SeqDenseCUDA(Mat A)
1129: {
1130: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1132: if (dA) {
1134: if (!dA->user_alloc) cudaFree(dA->d_v);
1135: cudaFree(dA->d_fact_tau);
1136: cudaFree(dA->d_fact_ipiv);
1137: cudaFree(dA->d_fact_info);
1138: cudaFree(dA->d_fact_work);
1139: VecDestroy(&dA->workvec);
1140: }
1141: PetscFree(A->spptr);
1142: return 0;
1143: }
1145: PetscErrorCode MatDestroy_SeqDenseCUDA(Mat A)
1146: {
1147: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1149: /* avoid copying the data back if we own the data pointer */
1150: if (!a->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
1151: MatConvert_SeqDenseCUDA_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A);
1152: MatDestroy_SeqDense(A);
1153: return 0;
1154: }
1156: PetscErrorCode MatDuplicate_SeqDenseCUDA(Mat A, MatDuplicateOption cpvalues, Mat *B)
1157: {
1158: MatDuplicateOption hcpvalues = (cpvalues == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : cpvalues;
1160: MatCreate(PetscObjectComm((PetscObject)A), B);
1161: MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
1162: MatSetType(*B, ((PetscObject)A)->type_name);
1163: MatDuplicateNoCreate_SeqDense(*B, A, hcpvalues);
1164: if (cpvalues == MAT_COPY_VALUES && hcpvalues != MAT_COPY_VALUES) MatCopy_SeqDenseCUDA(A, *B, SAME_NONZERO_PATTERN);
1165: if (cpvalues != MAT_COPY_VALUES) { /* allocate memory if needed */
1166: Mat_SeqDenseCUDA *dB = (Mat_SeqDenseCUDA *)(*B)->spptr;
1167: if (!dB->d_v) MatSeqDenseCUDASetPreallocation(*B, NULL);
1168: }
1169: return 0;
1170: }
1172: static PetscErrorCode MatGetColumnVector_SeqDenseCUDA(Mat A, Vec v, PetscInt col)
1173: {
1174: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1175: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1176: PetscScalar *x;
1177: PetscBool viscuda;
1179: PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, VECCUDA, "");
1180: if (viscuda && !v->boundtocpu) { /* update device data */
1181: VecCUDAGetArrayWrite(v, &x);
1182: if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1183: cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToDevice);
1184: } else {
1185: cudaMemcpy(x, a->v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyHostToDevice);
1186: }
1187: VecCUDARestoreArrayWrite(v, &x);
1188: } else { /* update host data */
1189: VecGetArrayWrite(v, &x);
1190: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask & PETSC_OFFLOAD_CPU) {
1191: PetscArraycpy(x, a->v + col * a->lda, A->rmap->n);
1192: } else if (A->offloadmask & PETSC_OFFLOAD_GPU) {
1193: cudaMemcpy(x, dA->d_v + col * a->lda, A->rmap->n * sizeof(PetscScalar), cudaMemcpyDeviceToHost);
1194: }
1195: VecRestoreArrayWrite(v, &x);
1196: }
1197: return 0;
1198: }
1200: PETSC_INTERN PetscErrorCode MatGetFactor_seqdense_cuda(Mat A, MatFactorType ftype, Mat *fact)
1201: {
1202: MatCreate(PetscObjectComm((PetscObject)A), fact);
1203: MatSetSizes(*fact, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n);
1204: MatSetType(*fact, MATSEQDENSECUDA);
1205: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU) {
1206: (*fact)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1207: (*fact)->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1208: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1209: (*fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1210: } else if (ftype == MAT_FACTOR_QR) {
1211: PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactor_C", MatQRFactor_SeqDense);
1212: PetscObjectComposeFunction((PetscObject)(*fact), "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense);
1213: }
1214: (*fact)->factortype = ftype;
1215: PetscFree((*fact)->solvertype);
1216: PetscStrallocpy(MATSOLVERCUDA, &(*fact)->solvertype);
1217: PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_LU]);
1218: PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ILU]);
1219: PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_CHOLESKY]);
1220: PetscStrallocpy(MATORDERINGEXTERNAL, (char **)&(*fact)->preferredordering[MAT_FACTOR_ICC]);
1221: return 0;
1222: }
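/* Usage sketch (illustrative only): obtaining the CUDA dense LU factorization and solving
   with it. Here A is a MATSEQDENSECUDA matrix and b, x are assumed to be VECSEQCUDA vectors
   provided by the caller.

     Mat           F;
     MatFactorInfo info;
     MatFactorInfoInitialize(&info);
     MatGetFactor(A, MATSOLVERCUDA, MAT_FACTOR_LU, &F);
     MatLUFactorSymbolic(F, A, NULL, NULL, &info);
     MatLUFactorNumeric(F, A, &info);
     MatSolve(F, b, x);
     MatDestroy(&F);
*/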
1224: static PetscErrorCode MatDenseGetColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1225: {
1226: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1230: MatDenseCUDAGetArray(A, (PetscScalar **)&a->ptrinuse);
1231: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1232: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1233: }
1234: a->vecinuse = col + 1;
1235: VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1236: *v = a->cvec;
1237: return 0;
1238: }
1240: static PetscErrorCode MatDenseRestoreColumnVec_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1241: {
1242: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1246: a->vecinuse = 0;
1247: VecCUDAResetArray(a->cvec);
1248: MatDenseCUDARestoreArray(A, (PetscScalar **)&a->ptrinuse);
1249: if (v) *v = NULL;
1250: return 0;
1251: }
1253: static PetscErrorCode MatDenseGetColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1254: {
1255: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1259: MatDenseCUDAGetArrayRead(A, &a->ptrinuse);
1260: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1261: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1262: }
1263: a->vecinuse = col + 1;
1264: VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1265: VecLockReadPush(a->cvec);
1266: *v = a->cvec;
1267: return 0;
1268: }
1270: static PetscErrorCode MatDenseRestoreColumnVecRead_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1271: {
1272: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1276: a->vecinuse = 0;
1277: VecLockReadPop(a->cvec);
1278: VecCUDAResetArray(a->cvec);
1279: MatDenseCUDARestoreArrayRead(A, &a->ptrinuse);
1280: if (v) *v = NULL;
1281: return 0;
1282: }
1284: static PetscErrorCode MatDenseGetColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1285: {
1286: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1290: MatDenseCUDAGetArrayWrite(A, (PetscScalar **)&a->ptrinuse);
1291: if (!a->cvec) { /* we pass the data of A, to prevent allocating needless GPU memory the first time VecCUDAPlaceArray is called */
1292: VecCreateSeqCUDAWithArray(PetscObjectComm((PetscObject)A), A->rmap->bs, A->rmap->n, a->ptrinuse, &a->cvec);
1293: }
1294: a->vecinuse = col + 1;
1295: VecCUDAPlaceArray(a->cvec, a->ptrinuse + (size_t)col * (size_t)a->lda);
1296: *v = a->cvec;
1297: return 0;
1298: }
1300: static PetscErrorCode MatDenseRestoreColumnVecWrite_SeqDenseCUDA(Mat A, PetscInt col, Vec *v)
1301: {
1302: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1306: a->vecinuse = 0;
1307: VecCUDAResetArray(a->cvec);
1308: MatDenseCUDARestoreArrayWrite(A, (PetscScalar **)&a->ptrinuse);
1309: if (v) *v = NULL;
1310: return 0;
1311: }
1313: static PetscErrorCode MatDenseGetSubMatrix_SeqDenseCUDA(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *v)
1314: {
1315: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1316: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1320: if (a->cmat && (cend - cbegin != a->cmat->cmap->N || rend - rbegin != a->cmat->rmap->N)) MatDestroy(&a->cmat);
1321: MatSeqDenseCUDACopyToGPU(A);
1322: if (!a->cmat) {
1323: MatCreateDenseCUDA(PetscObjectComm((PetscObject)A), rend - rbegin, PETSC_DECIDE, rend - rbegin, cend - cbegin, dA->d_v + rbegin + (size_t)cbegin * a->lda, &a->cmat);
1324: } else {
1325: MatDenseCUDAPlaceArray(a->cmat, dA->d_v + rbegin + (size_t)cbegin * a->lda);
1326: }
1327: MatDenseSetLDA(a->cmat, a->lda);
1328: /* Place the CPU array if present, but do not copy any data */
1329: a->cmat->offloadmask = PETSC_OFFLOAD_GPU;
1330: if (a->v) MatDensePlaceArray(a->cmat, a->v + rbegin + (size_t)cbegin * a->lda);
1331: a->cmat->offloadmask = A->offloadmask;
1332: a->matinuse = cbegin + 1;
1333: *v = a->cmat;
1334: return 0;
1335: }
1337: static PetscErrorCode MatDenseRestoreSubMatrix_SeqDenseCUDA(Mat A, Mat *v)
1338: {
1339: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1340: PetscBool copy = PETSC_FALSE, reset;
1341: PetscOffloadMask suboff;
1346: a->matinuse = 0;
1347: reset = a->v ? PETSC_TRUE : PETSC_FALSE;
1348: suboff = a->cmat->offloadmask; /* calls to ResetArray may change it, so save it here */
1349: if (suboff == PETSC_OFFLOAD_CPU && !a->v) { /* the submatrix was updated on the CPU but A has no host array yet; allocate one so the values can be synchronized below */
1350: copy = PETSC_TRUE;
1351: MatSeqDenseSetPreallocation(A, NULL);
1352: }
1353: MatDenseCUDAResetArray(a->cmat);
1354: if (reset) MatDenseResetArray(a->cmat);
1355: if (copy) {
1356: MatSeqDenseCUDACopyFromGPU(A);
1357: } else A->offloadmask = (suboff == PETSC_OFFLOAD_CPU) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1358: a->cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1359: if (v) *v = NULL;
1360: return 0;
1361: }
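/*
   A minimal caller-side sketch of the submatrix interface implemented above (illustrative only; A is assumed
   to be an assembled MATSEQDENSECUDA matrix and rbegin <= rend, cbegin <= cend are assumed valid index bounds):

     Mat S;

     MatDenseGetSubMatrix(A, rbegin, rend, cbegin, cend, &S);  // S views A(rbegin:rend-1, cbegin:cend-1), sharing storage
     MatScale(S, 0.5);                                         // operations on S modify the corresponding block of A
     MatDenseRestoreSubMatrix(A, &S);                          // updates A's offload state as handled above
*/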
1363: static PetscErrorCode MatDenseSetLDA_SeqDenseCUDA(Mat A, PetscInt lda)
1364: {
1365: Mat_SeqDense *cA = (Mat_SeqDense *)A->data;
1366: Mat_SeqDenseCUDA *dA = (Mat_SeqDenseCUDA *)A->spptr;
1367: PetscBool data;
1369: data = (PetscBool)((A->rmap->n > 0 && A->cmap->n > 0) ? (dA->d_v ? PETSC_TRUE : PETSC_FALSE) : PETSC_FALSE); /* has device storage already been allocated? */
1372: cA->lda = lda;
1373: return 0;
1374: }
1376: static PetscErrorCode MatSetUp_SeqDenseCUDA(Mat A)
1377: {
1378: PetscLayoutSetUp(A->rmap);
1379: PetscLayoutSetUp(A->cmap);
1380: if (!A->preallocated) MatSeqDenseCUDASetPreallocation(A, NULL);
1381: return 0;
1382: }
1384: static PetscErrorCode MatSetRandom_SeqDenseCUDA(Mat A, PetscRandom r)
1385: {
1386: PetscBool iscurand;
1388: PetscObjectTypeCompare((PetscObject)r, PETSCCURAND, &iscurand);
1389: if (iscurand) {
1390: PetscScalar *a;
1391: PetscInt lda, m, n, mn = 0;
1393: MatGetSize(A, &m, &n);
1394: MatDenseGetLDA(A, &lda);
1395: MatDenseCUDAGetArrayWrite(A, &a);
1396: if (lda > m) {
1397: for (PetscInt i = 0; i < n; i++) PetscRandomGetValues(r, m, a + i * lda);
1398: } else {
1399: PetscIntMultError(m, n, &mn);
1400: PetscRandomGetValues(r, mn, a);
1401: }
1402: MatDenseCUDARestoreArrayWrite(A, &a);
1403: } else MatSetRandom_SeqDense(A, r);
1404: return 0;
1405: }
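/*
   Illustrative sketch: with a PETSCCURAND random context the branch above fills the matrix directly on the
   device; any other PetscRandom type falls back to the host path. A is assumed to be a MATSEQDENSECUDA matrix.

     PetscRandom r;

     PetscRandomCreate(PETSC_COMM_SELF, &r);
     PetscRandomSetType(r, PETSCCURAND);
     MatSetRandom(A, r);
     PetscRandomDestroy(&r);
*/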
1407: static PetscErrorCode MatBindToCPU_SeqDenseCUDA(Mat A, PetscBool flg)
1408: {
1409: Mat_SeqDense *a = (Mat_SeqDense *)A->data;
1413: A->boundtocpu = flg;
1414: if (!flg) {
1415: PetscBool iscuda;
1417: PetscFree(A->defaultrandtype);
1418: PetscStrallocpy(PETSCCURAND, &A->defaultrandtype);
1419: PetscObjectTypeCompare((PetscObject)a->cvec, VECSEQCUDA, &iscuda);
1420: if (!iscuda) VecDestroy(&a->cvec);
1421: PetscObjectTypeCompare((PetscObject)a->cmat, MATSEQDENSECUDA, &iscuda);
1422: if (!iscuda) MatDestroy(&a->cmat);
1423: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDenseCUDA);
1424: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArrayRead_SeqDenseCUDA);
1425: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArrayWrite_SeqDenseCUDA);
1426: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDenseCUDA);
1427: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDenseCUDA);
1428: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDenseCUDA);
1429: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDenseCUDA);
1430: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDenseCUDA);
1431: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDenseCUDA);
1432: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDenseCUDA);
1433: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDenseCUDA);
1434: PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDenseCUDA);
1435: PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDenseCUDA);
1437: A->ops->duplicate = MatDuplicate_SeqDenseCUDA;
1438: A->ops->mult = MatMult_SeqDenseCUDA;
1439: A->ops->multadd = MatMultAdd_SeqDenseCUDA;
1440: A->ops->multtranspose = MatMultTranspose_SeqDenseCUDA;
1441: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDenseCUDA;
1442: A->ops->matmultnumeric = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1443: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1444: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA;
1445: A->ops->axpy = MatAXPY_SeqDenseCUDA;
1446: A->ops->choleskyfactor = MatCholeskyFactor_SeqDenseCUDA;
1447: A->ops->lufactor = MatLUFactor_SeqDenseCUDA;
1448: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDenseCUDA;
1449: A->ops->getcolumnvector = MatGetColumnVector_SeqDenseCUDA;
1450: A->ops->scale = MatScale_SeqDenseCUDA;
1451: A->ops->shift = MatShift_SeqDenseCUDA;
1452: A->ops->copy = MatCopy_SeqDenseCUDA;
1453: A->ops->zeroentries = MatZeroEntries_SeqDenseCUDA;
1454: A->ops->setup = MatSetUp_SeqDenseCUDA;
1455: A->ops->setrandom = MatSetRandom_SeqDenseCUDA;
1456: } else {
1457: /* make sure we have an up-to-date copy on the CPU */
1458: MatSeqDenseCUDACopyFromGPU(A);
1459: PetscFree(A->defaultrandtype);
1460: PetscStrallocpy(PETSCRANDER48, &A->defaultrandtype);
1461: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArray_C", MatDenseGetArray_SeqDense);
1462: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense);
1463: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense);
1464: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense);
1465: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense);
1466: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense);
1467: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense);
1468: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense);
1469: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense);
1470: PetscObjectComposeFunction((PetscObject)A, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense);
1471: PetscObjectComposeFunction((PetscObject)A, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense);
1472: PetscObjectComposeFunction((PetscObject)A, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense);
1473: PetscObjectComposeFunction((PetscObject)A, "MatQRFactor_C", MatQRFactor_SeqDense);
1475: A->ops->duplicate = MatDuplicate_SeqDense;
1476: A->ops->mult = MatMult_SeqDense;
1477: A->ops->multadd = MatMultAdd_SeqDense;
1478: A->ops->multtranspose = MatMultTranspose_SeqDense;
1479: A->ops->multtransposeadd = MatMultTransposeAdd_SeqDense;
1480: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1481: A->ops->matmultnumeric = MatMatMultNumeric_SeqDense_SeqDense;
1482: A->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqDense_SeqDense;
1483: A->ops->transposematmultnumeric = MatTransposeMatMultNumeric_SeqDense_SeqDense;
1484: A->ops->axpy = MatAXPY_SeqDense;
1485: A->ops->choleskyfactor = MatCholeskyFactor_SeqDense;
1486: A->ops->lufactor = MatLUFactor_SeqDense;
1488: A->ops->getcolumnvector = MatGetColumnVector_SeqDense;
1489: A->ops->scale = MatScale_SeqDense;
1490: A->ops->shift = MatShift_SeqDense;
1491: A->ops->copy = MatCopy_SeqDense;
1492: A->ops->zeroentries = MatZeroEntries_SeqDense;
1493: A->ops->setup = MatSetUp_SeqDense;
1494: A->ops->setrandom = MatSetRandom_SeqDense;
1495: }
1496: if (a->cmat) MatBindToCPU(a->cmat, flg);
1497: return 0;
1498: }
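/*
   Illustrative sketch: binding toggles between the CUDA and the host (SeqDense) implementations
   registered above. A is assumed to be a MATSEQDENSECUDA matrix.

     MatBindToCPU(A, PETSC_TRUE);   // force subsequent operations onto the CPU; an up-to-date host copy is made
     MatBindToCPU(A, PETSC_FALSE);  // switch back to the CUDA kernels
*/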
1500: PetscErrorCode MatConvert_SeqDenseCUDA_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1501: {
1502: Mat B;
1503: Mat_SeqDense *a;
1505: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1506: /* TODO these cases should be optimized */
1507: MatConvert_Basic(M, type, reuse, newmat);
1508: return 0;
1509: }
1511: B = *newmat;
1512: MatBindToCPU_SeqDenseCUDA(B, PETSC_TRUE);
1513: MatReset_SeqDenseCUDA(B);
1514: PetscFree(B->defaultvectype);
1515: PetscStrallocpy(VECSTANDARD, &B->defaultvectype);
1516: PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSE);
1517: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", NULL);
1518: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", NULL);
1519: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", NULL);
1520: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", NULL);
1521: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", NULL);
1522: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", NULL);
1523: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", NULL);
1524: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", NULL);
1525: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", NULL);
1526: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", NULL);
1527: PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", NULL);
1528: a = (Mat_SeqDense *)B->data;
1529: VecDestroy(&a->cvec); /* cvec might be VECSEQCUDA. Destroy it and rebuild a VECSEQ when needed */
1530: B->ops->bindtocpu = NULL;
1531: B->ops->destroy = MatDestroy_SeqDense;
1532: B->offloadmask = PETSC_OFFLOAD_CPU;
1533: return 0;
1534: }
1536: PetscErrorCode MatConvert_SeqDense_SeqDenseCUDA(Mat M, MatType type, MatReuse reuse, Mat *newmat)
1537: {
1538: Mat_SeqDenseCUDA *dB;
1539: Mat B;
1540: Mat_SeqDense *a;
1542: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1543: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
1544: /* TODO these cases should be optimized */
1545: MatConvert_Basic(M, type, reuse, newmat);
1546: return 0;
1547: }
1549: B = *newmat;
1550: PetscFree(B->defaultvectype);
1551: PetscStrallocpy(VECCUDA, &B->defaultvectype);
1552: PetscObjectChangeTypeName((PetscObject)B, MATSEQDENSECUDA);
1553: PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqdensecuda_seqdense_C", MatConvert_SeqDenseCUDA_SeqDense);
1554: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArray_C", MatDenseCUDAGetArray_SeqDenseCUDA);
1555: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayRead_C", MatDenseCUDAGetArrayRead_SeqDenseCUDA);
1556: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAGetArrayWrite_C", MatDenseCUDAGetArrayWrite_SeqDenseCUDA);
1557: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArray_C", MatDenseCUDARestoreArray_SeqDenseCUDA);
1558: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayRead_C", MatDenseCUDARestoreArrayRead_SeqDenseCUDA);
1559: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDARestoreArrayWrite_C", MatDenseCUDARestoreArrayWrite_SeqDenseCUDA);
1560: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAPlaceArray_C", MatDenseCUDAPlaceArray_SeqDenseCUDA);
1561: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAResetArray_C", MatDenseCUDAResetArray_SeqDenseCUDA);
1562: PetscObjectComposeFunction((PetscObject)B, "MatDenseCUDAReplaceArray_C", MatDenseCUDAReplaceArray_SeqDenseCUDA);
1563: PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_seqaij_seqdensecuda_C", MatProductSetFromOptions_SeqAIJ_SeqDense);
1564: a = (Mat_SeqDense *)B->data;
1565: VecDestroy(&a->cvec); /* cvec might be VECSEQ. Destroy it and rebuild a VECSEQCUDA when needed */
1566: PetscNew(&dB);
1568: B->spptr = dB;
1569: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1571: MatBindToCPU_SeqDenseCUDA(B, PETSC_FALSE);
1572: B->ops->bindtocpu = MatBindToCPU_SeqDenseCUDA;
1573: B->ops->destroy = MatDestroy_SeqDenseCUDA;
1574: return 0;
1575: }
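/*
   Illustrative sketch: converting between the host and CUDA dense types with MatConvert(). A is assumed
   to be an existing MATSEQDENSE matrix.

     Mat B;

     MatConvert(A, MATSEQDENSECUDA, MAT_INITIAL_MATRIX, &B);  // new matrix; handled by MatConvert_Basic above
     MatConvert(A, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &A);  // retype A itself; takes the in-place path above
*/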
1577: /*@C
1578: MatCreateSeqDenseCUDA - Creates a sequential matrix in dense format using CUDA.
1580: Collective
1582: Input Parameters:
1583: + comm - MPI communicator
1584: . m - number of rows
1585: . n - number of columns
1586: - data - optional location of GPU matrix data. Set data=NULL for PETSc
1587: to control matrix memory allocation.
1589: Output Parameter:
1590: . A - the matrix
1592: Level: intermediate
1594: .seealso: `MATSEQDENSE`, `MatCreate()`, `MatCreateSeqDense()`
1595: @*/
1596: PetscErrorCode MatCreateSeqDenseCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A)
1597: {
1598: PetscMPIInt size;
1600: MPI_Comm_size(comm, &size); /* a sequential matrix requires a communicator of size one */
1602: MatCreate(comm, A);
1603: MatSetSizes(*A, m, n, m, n);
1604: MatSetType(*A, MATSEQDENSECUDA);
1605: MatSeqDenseCUDASetPreallocation(*A, data);
1606: return 0;
1607: }
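/*
   A minimal complete example of the creation routine above (illustrative only; error checking via
   PetscCall() omitted for brevity):

     #include <petscmat.h>

     int main(int argc, char **argv)
     {
       Mat A;

       PetscInitialize(&argc, &argv, NULL, NULL);
       MatCreateSeqDenseCUDA(PETSC_COMM_SELF, 4, 4, NULL, &A);  // data = NULL: PETSc allocates the device storage
       MatSetValue(A, 0, 0, 1.0, INSERT_VALUES);
       MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
       MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
       MatDestroy(&A);
       PetscFinalize();
       return 0;
     }
*/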
1609: /*MC
1610: MATSEQDENSECUDA - MATSEQDENSECUDA = "seqdensecuda" - A matrix type to be used for sequential dense matrices on GPUs.
1612: Options Database Keys:
1613: . -mat_type seqdensecuda - sets the matrix type to `MATSEQDENSECUDA` during a call to MatSetFromOptions()
1615: Level: beginner
1617: .seealso: `MATSEQDENSE`
1618: M*/
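/*
   Illustrative sketch: selecting this matrix type from the options database
   (run the program with -mat_type seqdensecuda):

     Mat A;

     MatCreate(PETSC_COMM_SELF, &A);
     MatSetSizes(A, 10, 10, 10, 10);
     MatSetFromOptions(A);   // picks up -mat_type seqdensecuda
     MatSetUp(A);
*/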
1619: PETSC_EXTERN PetscErrorCode MatCreate_SeqDenseCUDA(Mat B)
1620: {
1621: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
1622: MatCreate_SeqDense(B);
1623: MatConvert_SeqDense_SeqDenseCUDA(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B);
1624: return 0;
1625: }