Actual source code: math2opus.cu
1: #include <h2opusconf.h>
2: /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
4: #include <h2opus.h>
5: #if defined(H2OPUS_USE_MPI)
6: #include <h2opus/distributed/distributed_h2opus_handle.h>
7: #include <h2opus/distributed/distributed_geometric_construction.h>
8: #include <h2opus/distributed/distributed_hgemv.h>
9: #include <h2opus/distributed/distributed_horthog.h>
10: #include <h2opus/distributed/distributed_hcompress.h>
11: #endif
12: #include <h2opus/util/boxentrygen.h>
13: #include <petsc/private/matimpl.h>
14: #include <petsc/private/vecimpl.h>
15: #include <petsc/private/deviceimpl.h>
16: #include <petscsf.h>
18: /* math2opusutils */
19: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusStridedSF(Mat, PetscSF, PetscSF *);
20: PETSC_INTERN PetscErrorCode VecSign(Vec, Vec);
21: PETSC_INTERN PetscErrorCode VecSetDelta(Vec, PetscInt);
22: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat, NormType, PetscInt, PetscReal *);
24: #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26: /* Use GPU only if H2OPUS is configured for GPU */
27: #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
28: #define PETSC_H2OPUS_USE_GPU
29: #endif
30: #if defined(PETSC_H2OPUS_USE_GPU)
31: #define MatH2OpusUpdateIfNeeded(A, B) MatBindToCPU(A, (PetscBool)((A)->boundtocpu || (B)))
32: #else
33: #define MatH2OpusUpdateIfNeeded(A, B) PETSC_SUCCESS
34: #endif
36: // TODO H2OPUS:
37: // DistributedHMatrix
38: // unsymmetric ?
39: // transpose for distributed_hgemv?
40: // clearData()
41: // Unify interface for sequential and parallel?
42: // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
43: //
44: template <class T>
45: class PetscPointCloud : public H2OpusDataSet<T> {
46: private:
47: int dimension;
48: size_t num_points;
49: std::vector<T> pts;
51: public:
52: PetscPointCloud(int dim, size_t num_pts, const T coords[])
53: {
54: dim = dim > 0 ? dim : 1;
55: this->dimension = dim;
56: this->num_points = num_pts;
58: pts.resize(num_pts * dim);
59: if (coords) {
60: for (size_t n = 0; n < num_pts; n++)
61: for (int i = 0; i < dim; i++) pts[n * dim + i] = coords[n * dim + i];
62: } else {
63: PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
64: for (size_t n = 0; n < num_pts; n++) {
65: pts[n * dim] = n * h;
66: for (int i = 1; i < dim; i++) pts[n * dim + i] = 0.0;
67: }
68: }
69: }
71: PetscPointCloud(const PetscPointCloud<T> &other)
72: {
73: size_t N = other.dimension * other.num_points;
74: this->dimension = other.dimension;
75: this->num_points = other.num_points;
76: this->pts.resize(N);
77: for (size_t i = 0; i < N; i++) this->pts[i] = other.pts[i];
78: }
80: int getDimension() const { return dimension; }
82: size_t getDataSetSize() const { return num_points; }
84: T getDataPoint(size_t idx, int dim) const
85: {
86: assert(dim < dimension && idx < num_points);
87: return pts[idx * dimension + dim];
88: }
90: void Print(std::ostream &out = std::cout)
91: {
92: out << "Dimension: " << dimension << std::endl;
93: out << "NumPoints: " << num_points << std::endl;
94: for (size_t n = 0; n < num_points; n++) {
95: for (int d = 0; d < dimension; d++) out << pts[n * dimension + d] << " ";
96: out << std::endl;
97: }
98: }
99: };
101: template <class T>
102: class PetscFunctionGenerator {
103: private:
104: MatH2OpusKernelFn *k;
105: int dim;
106: void *ctx;
108: public:
109: PetscFunctionGenerator(MatH2OpusKernelFn *k, int dim, void *ctx)
110: {
111: this->k = k;
112: this->dim = dim;
113: this->ctx = ctx;
114: }
115: PetscFunctionGenerator(PetscFunctionGenerator &other)
116: {
117: this->k = other.k;
118: this->dim = other.dim;
119: this->ctx = other.ctx;
120: }
121: T operator()(PetscReal *pt1, PetscReal *pt2) { return (T)((*this->k)(this->dim, pt1, pt2, this->ctx)); }
122: };
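/* Editor's note (illustrative sketch, not part of the original source): a user kernel wrapped by
   PetscFunctionGenerator above follows the MatH2OpusKernelFn signature declared in petscmat.h.
   The Gaussian form, the function name and the ctx layout below are hypothetical examples. */
#if 0
static PetscScalar GaussianKernel_Example(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
{
  PetscReal sigma = ctx ? *(PetscReal *)ctx : 1.0; /* hypothetical context: pointer to the bandwidth */
  PetscReal r2    = 0.0;
  for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
  return (PetscScalar)PetscExpReal(-r2 / (2.0 * sigma * sigma));
}
#endif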
124: #include <../src/mat/impls/h2opus/math2opussampler.hpp>
126: /* just to not clutter the code */
127: #if !defined(H2OPUS_USE_GPU)
128: typedef HMatrix HMatrix_GPU;
129: #if defined(H2OPUS_USE_MPI)
130: typedef DistributedHMatrix DistributedHMatrix_GPU;
131: #endif
132: #endif
134: typedef struct {
135: #if defined(H2OPUS_USE_MPI)
136: distributedH2OpusHandle_t handle;
137: #else
138: h2opusHandle_t handle;
139: #endif
140: /* Sequential and parallel matrices are two different classes at the moment */
141: HMatrix *hmatrix;
142: #if defined(H2OPUS_USE_MPI)
143: DistributedHMatrix *dist_hmatrix;
144: #else
145: HMatrix *dist_hmatrix; /* just to not clutter the code */
146: #endif
147: /* May use permutations */
148: PetscSF sf;
149: PetscLayout h2opus_rmap, h2opus_cmap;
150: IS h2opus_indexmap;
151: thrust::host_vector<PetscScalar> *xx, *yy;
152: PetscInt xxs, yys;
153: PetscBool multsetup;
155: /* GPU */
156: HMatrix_GPU *hmatrix_gpu;
157: #if defined(H2OPUS_USE_MPI)
158: DistributedHMatrix_GPU *dist_hmatrix_gpu;
159: #else
160: HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
161: #endif
162: #if defined(PETSC_H2OPUS_USE_GPU)
163: thrust::device_vector<PetscScalar> *xx_gpu, *yy_gpu;
164: PetscInt xxs_gpu, yys_gpu;
165: #endif
167: /* construction from matvecs */
168: PetscMatrixSampler *sampler;
169: PetscBool nativemult;
171: /* Admissibility */
172: PetscReal eta;
173: PetscInt leafsize;
175: /* for dof reordering */
176: PetscPointCloud<PetscReal> *ptcloud;
178: /* kernel for generating matrix entries */
179: PetscFunctionGenerator<PetscScalar> *kernel;
181: /* basis orthogonalized? */
182: PetscBool orthogonal;
184: /* customization */
185: PetscInt basisord;
186: PetscInt max_rank;
187: PetscInt bs;
188: PetscReal rtol;
189: PetscInt norm_max_samples;
190: PetscBool check_construction;
191: PetscBool hara_verbose;
192: PetscBool resize;
194: /* keeps track of MatScale values */
195: PetscScalar s;
196: } Mat_H2OPUS;
198: static PetscErrorCode MatDestroy_H2OPUS(Mat A)
199: {
200: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
202: PetscFunctionBegin;
203: #if defined(H2OPUS_USE_MPI)
204: h2opusDestroyDistributedHandle(a->handle);
205: #else
206: h2opusDestroyHandle(a->handle);
207: #endif
208: delete a->dist_hmatrix;
209: delete a->hmatrix;
210: PetscCall(PetscSFDestroy(&a->sf));
211: PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
212: PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
213: PetscCall(ISDestroy(&a->h2opus_indexmap));
214: delete a->xx;
215: delete a->yy;
216: delete a->hmatrix_gpu;
217: delete a->dist_hmatrix_gpu;
218: #if defined(PETSC_H2OPUS_USE_GPU)
219: delete a->xx_gpu;
220: delete a->yy_gpu;
221: #endif
222: delete a->sampler;
223: delete a->ptcloud;
224: delete a->kernel;
225: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", NULL));
226: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", NULL));
227: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", NULL));
228: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", NULL));
229: PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
230: PetscCall(PetscFree(A->data));
231: PetscFunctionReturn(PETSC_SUCCESS);
232: }
234: PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
235: {
236: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
237: PetscBool ish2opus;
239: PetscFunctionBegin;
242: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
243: if (ish2opus) {
244: if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
245: if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
246: PetscLayout t;
247: t = A->rmap;
248: A->rmap = a->h2opus_rmap;
249: a->h2opus_rmap = t;
250: t = A->cmap;
251: A->cmap = a->h2opus_cmap;
252: a->h2opus_cmap = t;
253: }
254: }
255: a->nativemult = nm;
256: }
257: PetscFunctionReturn(PETSC_SUCCESS);
258: }
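/* Editor's note (illustrative sketch, not part of the original source): with native multiplication
   enabled, vectors must follow H2Opus's internal distribution, which is why the layouts are swapped
   above. A minimal usage sketch, assuming an assembled MATH2OPUS matrix A: */
#if 0
  Vec x, y;
  PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
  PetscCall(MatCreateVecs(A, &x, &y)); /* vectors laid out according to the swapped (H2Opus) layouts */
  PetscCall(MatMult(A, x, y));
#endif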
260: PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
261: {
262: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
263: PetscBool ish2opus;
265: PetscFunctionBegin;
267: PetscAssertPointer(nm, 2);
268: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
269: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
270: *nm = a->nativemult;
271: PetscFunctionReturn(PETSC_SUCCESS);
272: }
274: PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal *n)
275: {
276: PetscBool ish2opus;
277: PetscInt nmax = PETSC_DECIDE;
278: Mat_H2OPUS *a = NULL;
279: PetscBool mult = PETSC_FALSE;
281: PetscFunctionBegin;
282: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
283: if (ish2opus) { /* set user-defined number of samples and fast path for mult (norms are order independent) */
284: a = (Mat_H2OPUS *)A->data;
286: nmax = a->norm_max_samples;
287: mult = a->nativemult;
288: PetscCall(MatH2OpusSetNativeMult(A, PETSC_TRUE));
289: } else {
290: PetscCall(PetscOptionsGetInt(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_approximate_norm_samples", &nmax, NULL));
291: }
292: PetscCall(MatApproximateNorm_Private(A, normtype, nmax, n));
293: if (a) PetscCall(MatH2OpusSetNativeMult(A, mult));
294: PetscFunctionReturn(PETSC_SUCCESS);
295: }
297: static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
298: {
299: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
300: PetscInt n;
301: PetscBool boundtocpu = PETSC_TRUE;
303: PetscFunctionBegin;
304: #if defined(PETSC_H2OPUS_USE_GPU)
305: boundtocpu = A->boundtocpu;
306: #endif
307: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
308: if (boundtocpu) {
309: if (h2opus->xxs < xN) {
310: h2opus->xx->resize(n * xN);
311: h2opus->xxs = xN;
312: }
313: if (h2opus->yys < yN) {
314: h2opus->yy->resize(n * yN);
315: h2opus->yys = yN;
316: }
317: }
318: #if defined(PETSC_H2OPUS_USE_GPU)
319: if (!boundtocpu) {
320: if (h2opus->xxs_gpu < xN) {
321: h2opus->xx_gpu->resize(n * xN);
322: h2opus->xxs_gpu = xN;
323: }
324: if (h2opus->yys_gpu < yN) {
325: h2opus->yy_gpu->resize(n * yN);
326: h2opus->yys_gpu = yN;
327: }
328: }
329: #endif
330: PetscFunctionReturn(PETSC_SUCCESS);
331: }
333: static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
334: {
335: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
336: #if defined(H2OPUS_USE_MPI)
337: h2opusHandle_t handle = h2opus->handle->handle;
338: #else
339: h2opusHandle_t handle = h2opus->handle;
340: #endif
341: PetscBool boundtocpu = PETSC_TRUE;
342: PetscScalar *xx, *yy, *uxx, *uyy;
343: PetscInt blda, clda;
344: PetscMPIInt size;
345: PetscSF bsf, csf;
346: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
348: PetscFunctionBegin;
349: HLibProfile::clear();
350: #if defined(PETSC_H2OPUS_USE_GPU)
351: boundtocpu = A->boundtocpu;
352: #endif
353: PetscCall(MatDenseGetLDA(B, &blda));
354: PetscCall(MatDenseGetLDA(C, &clda));
355: if (usesf) {
356: PetscInt n;
358: PetscCall(MatDenseGetH2OpusStridedSF(B, h2opus->sf, &bsf));
359: PetscCall(MatDenseGetH2OpusStridedSF(C, h2opus->sf, &csf));
361: PetscCall(MatH2OpusResizeBuffers_Private(A, B->cmap->N, C->cmap->N));
362: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
363: blda = n;
364: clda = n;
365: }
366: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
367: if (boundtocpu) {
368: PetscCall(MatDenseGetArrayRead(B, (const PetscScalar **)&xx));
369: PetscCall(MatDenseGetArrayWrite(C, &yy));
370: if (usesf) {
371: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
372: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
373: PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
374: PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
375: } else {
376: uxx = xx;
377: uyy = yy;
378: }
379: if (size > 1) {
380: PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
381: PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
382: #if defined(H2OPUS_USE_MPI)
383: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
384: #endif
385: } else {
386: PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
387: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
388: }
389: PetscCall(MatDenseRestoreArrayRead(B, (const PetscScalar **)&xx));
390: if (usesf) {
391: PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
392: PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
393: }
394: PetscCall(MatDenseRestoreArrayWrite(C, &yy));
395: #if defined(PETSC_H2OPUS_USE_GPU)
396: } else {
397: PetscBool ciscuda, biscuda;
399: /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
400: PetscCall(PetscObjectTypeCompareAny((PetscObject)B, &biscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
401: if (!biscuda) PetscCall(MatConvert(B, MATDENSECUDA, MAT_INPLACE_MATRIX, &B));
402: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &ciscuda, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
403: if (!ciscuda) {
404: C->assembled = PETSC_TRUE;
405: PetscCall(MatConvert(C, MATDENSECUDA, MAT_INPLACE_MATRIX, &C));
406: }
407: PetscCall(MatDenseCUDAGetArrayRead(B, (const PetscScalar **)&xx));
408: PetscCall(MatDenseCUDAGetArrayWrite(C, &yy));
409: if (usesf) {
410: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
411: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
412: PetscCall(PetscSFBcastBegin(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
413: PetscCall(PetscSFBcastEnd(bsf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
414: } else {
415: uxx = xx;
416: uyy = yy;
417: }
418: PetscCall(PetscLogGpuTimeBegin());
419: if (size > 1) {
420: PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
421: PetscCheck(!transA || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
422: #if defined(H2OPUS_USE_MPI)
423: distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
424: #endif
425: } else {
426: PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
427: hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
428: }
429: PetscCall(PetscLogGpuTimeEnd());
430: PetscCall(MatDenseCUDARestoreArrayRead(B, (const PetscScalar **)&xx));
431: if (usesf) {
432: PetscCall(PetscSFReduceBegin(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
433: PetscCall(PetscSFReduceEnd(csf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
434: }
435: PetscCall(MatDenseCUDARestoreArrayWrite(C, &yy));
436: if (!biscuda) PetscCall(MatConvert(B, MATDENSE, MAT_INPLACE_MATRIX, &B));
437: if (!ciscuda) PetscCall(MatConvert(C, MATDENSE, MAT_INPLACE_MATRIX, &C));
438: #endif
439: }
440: { /* log flops */
441: double gops, time, perf, dev;
442: HLibProfile::getHgemvPerf(gops, time, perf, dev);
443: #if defined(PETSC_H2OPUS_USE_GPU)
444: if (boundtocpu) {
445: PetscCall(PetscLogFlops(1e9 * gops));
446: } else {
447: PetscCall(PetscLogGpuFlops(1e9 * gops));
448: }
449: #else
450: PetscCall(PetscLogFlops(1e9 * gops));
451: #endif
452: }
453: PetscFunctionReturn(PETSC_SUCCESS);
454: }
456: static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
457: {
458: Mat_Product *product = C->product;
460: PetscFunctionBegin;
461: MatCheckProduct(C, 1);
462: switch (product->type) {
463: case MATPRODUCT_AB:
464: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_FALSE, product->B, C));
465: break;
466: case MATPRODUCT_AtB:
467: PetscCall(MatMultNKernel_H2OPUS(product->A, PETSC_TRUE, product->B, C));
468: break;
469: default:
470: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
471: }
472: PetscFunctionReturn(PETSC_SUCCESS);
473: }
475: static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
476: {
477: Mat_Product *product = C->product;
478: PetscBool cisdense;
479: Mat A, B;
481: PetscFunctionBegin;
482: MatCheckProduct(C, 1);
483: A = product->A;
484: B = product->B;
485: switch (product->type) {
486: case MATPRODUCT_AB:
487: PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
488: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
489: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
490: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
491: PetscCall(MatSetUp(C));
492: break;
493: case MATPRODUCT_AtB:
494: PetscCall(MatSetSizes(C, A->cmap->n, B->cmap->n, A->cmap->N, B->cmap->N));
495: PetscCall(MatSetBlockSizesFromMats(C, product->A, product->B));
496: PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATSEQDENSE, MATMPIDENSE, MATSEQDENSECUDA, MATMPIDENSECUDA, ""));
497: if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)product->B)->type_name));
498: PetscCall(MatSetUp(C));
499: break;
500: default:
501: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type %s is not supported", MatProductTypes[product->type]);
502: }
503: C->ops->productsymbolic = NULL;
504: C->ops->productnumeric = MatProductNumeric_H2OPUS;
505: PetscFunctionReturn(PETSC_SUCCESS);
506: }
508: static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
509: {
510: PetscFunctionBegin;
511: MatCheckProduct(C, 1);
512: if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
513: PetscFunctionReturn(PETSC_SUCCESS);
514: }
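/* Editor's note (illustrative sketch, not part of the original source): the MatProduct hooks above
   support C = A*B and C = A^T*B with a dense B. A sketch of the standard MatProduct call sequence,
   assuming A is MATH2OPUS and B is MATDENSE: */
#if 0
  Mat C;
  PetscCall(MatProductCreate(A, B, NULL, &C));
  PetscCall(MatProductSetType(C, MATPRODUCT_AB));
  PetscCall(MatProductSetFromOptions(C));
  PetscCall(MatProductSymbolic(C));
  PetscCall(MatProductNumeric(C));
#endif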
516: static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517: {
518: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
519: #if defined(H2OPUS_USE_MPI)
520: h2opusHandle_t handle = h2opus->handle->handle;
521: #else
522: h2opusHandle_t handle = h2opus->handle;
523: #endif
524: PetscBool boundtocpu = PETSC_TRUE;
525: PetscInt n;
526: PetscScalar *xx, *yy, *uxx, *uyy;
527: PetscMPIInt size;
528: PetscBool usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
530: PetscFunctionBegin;
531: HLibProfile::clear();
532: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
533: #if defined(PETSC_H2OPUS_USE_GPU)
534: boundtocpu = A->boundtocpu;
535: #endif
536: if (usesf) {
537: PetscCall(PetscSFGetGraph(h2opus->sf, NULL, &n, NULL, NULL));
538: } else n = A->rmap->n;
539: if (boundtocpu) {
540: PetscCall(VecGetArrayRead(x, (const PetscScalar **)&xx));
541: if (sy == 0.0) {
542: PetscCall(VecGetArrayWrite(y, &yy));
543: } else {
544: PetscCall(VecGetArray(y, &yy));
545: }
546: if (usesf) {
547: uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548: uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
550: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
551: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
552: if (sy != 0.0) {
553: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
554: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
555: }
556: } else {
557: uxx = xx;
558: uyy = yy;
559: }
560: if (size > 1) {
561: PetscCheck(h2opus->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
562: PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
563: #if defined(H2OPUS_USE_MPI)
564: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565: #endif
566: } else {
567: PetscCheck(h2opus->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
568: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569: }
570: PetscCall(VecRestoreArrayRead(x, (const PetscScalar **)&xx));
571: if (usesf) {
572: PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
573: PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
574: }
575: if (sy == 0.0) {
576: PetscCall(VecRestoreArrayWrite(y, &yy));
577: } else {
578: PetscCall(VecRestoreArray(y, &yy));
579: }
580: #if defined(PETSC_H2OPUS_USE_GPU)
581: } else {
582: PetscCall(VecCUDAGetArrayRead(x, (const PetscScalar **)&xx));
583: if (sy == 0.0) {
584: PetscCall(VecCUDAGetArrayWrite(y, &yy));
585: } else {
586: PetscCall(VecCUDAGetArray(y, &yy));
587: }
588: if (usesf) {
589: uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590: uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
592: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
593: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, xx, uxx, MPI_REPLACE));
594: if (sy != 0.0) {
595: PetscCall(PetscSFBcastBegin(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
596: PetscCall(PetscSFBcastEnd(h2opus->sf, MPIU_SCALAR, yy, uyy, MPI_REPLACE));
597: }
598: } else {
599: uxx = xx;
600: uyy = yy;
601: }
602: PetscCall(PetscLogGpuTimeBegin());
603: if (size > 1) {
604: PetscCheck(h2opus->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed GPU matrix");
605: PetscCheck(!trans || A->symmetric, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "MatMultTranspose not yet coded in parallel");
606: #if defined(H2OPUS_USE_MPI)
607: distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */ h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608: #endif
609: } else {
610: PetscCheck(h2opus->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
611: hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612: }
613: PetscCall(PetscLogGpuTimeEnd());
614: PetscCall(VecCUDARestoreArrayRead(x, (const PetscScalar **)&xx));
615: if (usesf) {
616: PetscCall(PetscSFReduceBegin(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
617: PetscCall(PetscSFReduceEnd(h2opus->sf, MPIU_SCALAR, uyy, yy, MPI_REPLACE));
618: }
619: if (sy == 0.0) {
620: PetscCall(VecCUDARestoreArrayWrite(y, &yy));
621: } else {
622: PetscCall(VecCUDARestoreArray(y, &yy));
623: }
624: #endif
625: }
626: { /* log flops */
627: double gops, time, perf, dev;
628: HLibProfile::getHgemvPerf(gops, time, perf, dev);
629: #if defined(PETSC_H2OPUS_USE_GPU)
630: if (boundtocpu) {
631: PetscCall(PetscLogFlops(1e9 * gops));
632: } else {
633: PetscCall(PetscLogGpuFlops(1e9 * gops));
634: }
635: #else
636: PetscCall(PetscLogFlops(1e9 * gops));
637: #endif
638: }
639: PetscFunctionReturn(PETSC_SUCCESS);
640: }
642: static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643: {
644: PetscBool xiscuda, yiscuda;
646: PetscFunctionBegin;
647: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
648: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
649: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
650: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_TRUE));
651: PetscFunctionReturn(PETSC_SUCCESS);
652: }
654: static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655: {
656: PetscBool xiscuda, yiscuda;
658: PetscFunctionBegin;
659: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
660: PetscCall(PetscObjectTypeCompareAny((PetscObject)y, &yiscuda, VECSEQCUDA, VECMPICUDA, ""));
661: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !yiscuda));
662: PetscCall(MatMultKernel_H2OPUS(A, x, 0.0, y, PETSC_FALSE));
663: PetscFunctionReturn(PETSC_SUCCESS);
664: }
666: static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667: {
668: PetscBool xiscuda, ziscuda;
670: PetscFunctionBegin;
671: PetscCall(VecCopy(y, z));
672: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
673: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
674: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
675: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_TRUE));
676: PetscFunctionReturn(PETSC_SUCCESS);
677: }
679: static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680: {
681: PetscBool xiscuda, ziscuda;
683: PetscFunctionBegin;
684: PetscCall(VecCopy(y, z));
685: PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &xiscuda, VECSEQCUDA, VECMPICUDA, ""));
686: PetscCall(PetscObjectTypeCompareAny((PetscObject)z, &ziscuda, VECSEQCUDA, VECMPICUDA, ""));
687: PetscCall(MatH2OpusUpdateIfNeeded(A, !xiscuda || !ziscuda));
688: PetscCall(MatMultKernel_H2OPUS(A, x, 1.0, z, PETSC_FALSE));
689: PetscFunctionReturn(PETSC_SUCCESS);
690: }
692: static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693: {
694: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
696: PetscFunctionBegin;
697: a->s *= s;
698: PetscFunctionReturn(PETSC_SUCCESS);
699: }
701: static PetscErrorCode MatSetFromOptions_H2OPUS(Mat A, PetscOptionItems *PetscOptionsObject)
702: {
703: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
705: PetscFunctionBegin;
706: PetscOptionsHeadBegin(PetscOptionsObject, "H2OPUS options");
707: PetscCall(PetscOptionsInt("-mat_h2opus_leafsize", "Leaf size of cluster tree", NULL, a->leafsize, &a->leafsize, NULL));
708: PetscCall(PetscOptionsReal("-mat_h2opus_eta", "Admissibility condition tolerance", NULL, a->eta, &a->eta, NULL));
709: PetscCall(PetscOptionsInt("-mat_h2opus_order", "Basis order for off-diagonal sampling when constructed from kernel", NULL, a->basisord, &a->basisord, NULL));
710: PetscCall(PetscOptionsInt("-mat_h2opus_maxrank", "Maximum rank when constructed from matvecs", NULL, a->max_rank, &a->max_rank, NULL));
711: PetscCall(PetscOptionsInt("-mat_h2opus_samples", "Maximum number of samples to be taken concurrently when constructing from matvecs", NULL, a->bs, &a->bs, NULL));
712: PetscCall(PetscOptionsInt("-mat_h2opus_normsamples", "Maximum number of samples to be when estimating norms", NULL, a->norm_max_samples, &a->norm_max_samples, NULL));
713: PetscCall(PetscOptionsReal("-mat_h2opus_rtol", "Relative tolerance for construction from sampling", NULL, a->rtol, &a->rtol, NULL));
714: PetscCall(PetscOptionsBool("-mat_h2opus_check", "Check error when constructing from sampling during MatAssemblyEnd()", NULL, a->check_construction, &a->check_construction, NULL));
715: PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose", "Verbose output from hara construction", NULL, a->hara_verbose, &a->hara_verbose, NULL));
716: PetscCall(PetscOptionsBool("-mat_h2opus_resize", "Resize after compression", NULL, a->resize, &a->resize, NULL));
717: PetscOptionsHeadEnd();
718: PetscFunctionReturn(PETSC_SUCCESS);
719: }
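/* Editor's note (not part of the original source): the options registered above are typically set on
   the command line, e.g. (values are only examples)
     -mat_type h2opus -mat_h2opus_leafsize 64 -mat_h2opus_eta 0.9 -mat_h2opus_maxrank 128 -mat_h2opus_rtol 1e-6
*/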
721: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat, PetscInt, const PetscReal[], PetscBool, MatH2OpusKernelFn *, void *);
723: static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724: {
725: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
726: Vec c;
727: PetscInt spacedim;
728: const PetscScalar *coords;
730: PetscFunctionBegin;
731: if (a->ptcloud) PetscFunctionReturn(PETSC_SUCCESS);
732: PetscCall(PetscObjectQuery((PetscObject)A, "__math2opus_coords", (PetscObject *)&c));
733: if (!c && a->sampler) {
734: Mat S = a->sampler->GetSamplingMat();
736: PetscCall(PetscObjectQuery((PetscObject)S, "__math2opus_coords", (PetscObject *)&c));
737: }
738: if (!c) {
739: PetscCall(MatH2OpusSetCoords_H2OPUS(A, -1, NULL, PETSC_FALSE, NULL, NULL));
740: } else {
741: PetscCall(VecGetArrayRead(c, &coords));
742: PetscCall(VecGetBlockSize(c, &spacedim));
743: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, PETSC_FALSE, NULL, NULL));
744: PetscCall(VecRestoreArrayRead(c, &coords));
745: }
746: PetscFunctionReturn(PETSC_SUCCESS);
747: }
749: static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
750: {
751: MPI_Comm comm;
752: PetscMPIInt size;
753: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
754: PetscInt n = 0, *idx = NULL;
755: int *iidx = NULL;
756: PetscCopyMode own;
757: PetscBool rid;
759: PetscFunctionBegin;
760: if (a->multsetup) PetscFunctionReturn(PETSC_SUCCESS);
761: if (a->sf) { /* MatDuplicate_H2OPUS takes a reference to the SF */
762: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
763: #if defined(PETSC_H2OPUS_USE_GPU)
764: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
765: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
766: a->xxs_gpu = 1;
767: a->yys_gpu = 1;
768: #endif
769: a->xx = new thrust::host_vector<PetscScalar>(n);
770: a->yy = new thrust::host_vector<PetscScalar>(n);
771: a->xxs = 1;
772: a->yys = 1;
773: } else {
774: IS is;
775: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
776: PetscCallMPI(MPI_Comm_size(comm, &size));
777: if (!a->h2opus_indexmap) {
778: if (size > 1) {
779: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
780: #if defined(H2OPUS_USE_MPI)
781: iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
782: n = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
783: #endif
784: } else {
785: iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
786: n = a->hmatrix->u_basis_tree.index_map.size();
787: }
789: if (PetscDefined(USE_64BIT_INDICES)) {
790: PetscInt i;
792: own = PETSC_OWN_POINTER;
793: PetscCall(PetscMalloc1(n, &idx));
794: for (i = 0; i < n; i++) idx[i] = iidx[i];
795: } else {
796: own = PETSC_COPY_VALUES;
797: idx = (PetscInt *)iidx;
798: }
799: PetscCall(ISCreateGeneral(comm, n, idx, own, &is));
800: PetscCall(ISSetPermutation(is));
801: PetscCall(ISViewFromOptions(is, (PetscObject)A, "-mat_h2opus_indexmap_view"));
802: a->h2opus_indexmap = is;
803: }
804: PetscCall(ISGetLocalSize(a->h2opus_indexmap, &n));
805: PetscCall(ISGetIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
806: rid = (PetscBool)(n == A->rmap->n);
807: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &rid, 1, MPIU_BOOL, MPI_LAND, comm));
808: if (rid) PetscCall(ISIdentity(a->h2opus_indexmap, &rid));
809: if (!rid) {
810: if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
811: PetscCall(PetscLayoutCreate(comm, &a->h2opus_rmap));
812: PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap, n));
813: PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
814: PetscCall(PetscLayoutReference(a->h2opus_rmap, &a->h2opus_cmap));
815: }
816: PetscCall(PetscSFCreate(comm, &a->sf));
817: PetscCall(PetscSFSetGraphLayout(a->sf, A->rmap, n, NULL, PETSC_OWN_POINTER, idx));
818: PetscCall(PetscSFSetUp(a->sf));
819: PetscCall(PetscSFViewFromOptions(a->sf, (PetscObject)A, "-mat_h2opus_sf_view"));
820: #if defined(PETSC_H2OPUS_USE_GPU)
821: a->xx_gpu = new thrust::device_vector<PetscScalar>(n);
822: a->yy_gpu = new thrust::device_vector<PetscScalar>(n);
823: a->xxs_gpu = 1;
824: a->yys_gpu = 1;
825: #endif
826: a->xx = new thrust::host_vector<PetscScalar>(n);
827: a->yy = new thrust::host_vector<PetscScalar>(n);
828: a->xxs = 1;
829: a->yys = 1;
830: }
831: PetscCall(ISRestoreIndices(a->h2opus_indexmap, (const PetscInt **)&idx));
832: }
833: a->multsetup = PETSC_TRUE;
834: PetscFunctionReturn(PETSC_SUCCESS);
835: }
837: static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
838: {
839: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
840: #if defined(H2OPUS_USE_MPI)
841: h2opusHandle_t handle = a->handle->handle;
842: #else
843: h2opusHandle_t handle = a->handle;
844: #endif
845: PetscBool kernel = PETSC_FALSE;
846: PetscBool boundtocpu = PETSC_TRUE;
847: PetscBool samplingdone = PETSC_FALSE;
848: MPI_Comm comm;
849: PetscMPIInt size;
851: PetscFunctionBegin;
852: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
853: PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
854: PetscCheck(A->rmap->N == A->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
856: /* XXX */
857: a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
859: PetscCallMPI(MPI_Comm_size(comm, &size));
860: /* TODO REUSABILITY of geometric construction */
861: delete a->hmatrix;
862: delete a->dist_hmatrix;
863: #if defined(PETSC_H2OPUS_USE_GPU)
864: delete a->hmatrix_gpu;
865: delete a->dist_hmatrix_gpu;
866: #endif
867: a->orthogonal = PETSC_FALSE;
869: /* TODO: other? */
870: H2OpusBoxCenterAdmissibility adm(a->eta);
872: PetscCall(PetscLogEventBegin(MAT_H2Opus_Build, A, 0, 0, 0));
873: if (size > 1) {
874: #if defined(H2OPUS_USE_MPI)
875: a->dist_hmatrix = new DistributedHMatrix(A->rmap->n /* ,A->symmetric */);
876: #else
877: a->dist_hmatrix = NULL;
878: #endif
879: } else a->hmatrix = new HMatrix(A->rmap->n, A->symmetric == PETSC_BOOL3_TRUE);
880: PetscCall(MatH2OpusInferCoordinates_Private(A));
881: PetscCheck(a->ptcloud, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing pointcloud");
882: if (a->kernel) {
883: BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
884: if (size > 1) {
885: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
886: #if defined(H2OPUS_USE_MPI)
887: buildDistributedHMatrix(*a->dist_hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord, a->handle);
888: #endif
889: } else {
890: buildHMatrix(*a->hmatrix, a->ptcloud, adm, entry_gen, a->leafsize, a->basisord);
891: }
892: kernel = PETSC_TRUE;
893: } else {
894: PetscCheck(size <= 1, comm, PETSC_ERR_SUP, "Construction from sampling not supported in parallel");
895: buildHMatrixStructure(*a->hmatrix, a->ptcloud, a->leafsize, adm);
896: }
897: PetscCall(MatSetUpMultiply_H2OPUS(A));
899: #if defined(PETSC_H2OPUS_USE_GPU)
900: boundtocpu = A->boundtocpu;
901: if (!boundtocpu) {
902: if (size > 1) {
903: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing distributed CPU matrix");
904: #if defined(H2OPUS_USE_MPI)
905: a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
906: #endif
907: } else {
908: a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
909: }
910: }
911: #endif
912: if (size == 1) {
913: if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
914: PetscReal Anorm;
915: bool verbose;
917: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_hara_verbose", &a->hara_verbose, NULL));
918: verbose = a->hara_verbose;
919: PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(), NORM_2, a->norm_max_samples, &Anorm));
920: if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF, "Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n", a->max_rank, a->rtol * Anorm, a->rtol, Anorm, boundtocpu ? "CPU" : "GPU", a->bs));
921: if (a->sf && !a->nativemult) a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(), a->hmatrix->u_basis_tree.index_map.data());
922: a->sampler->SetStream(handle->getMainStream());
923: if (boundtocpu) {
924: a->sampler->SetGPUSampling(false);
925: hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
926: #if defined(PETSC_H2OPUS_USE_GPU)
927: } else {
928: a->sampler->SetGPUSampling(true);
929: hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */, a->rtol * Anorm, a->bs, handle, verbose);
930: #endif
931: }
932: samplingdone = PETSC_TRUE;
933: }
934: }
935: #if defined(PETSC_H2OPUS_USE_GPU)
936: if (!boundtocpu) {
937: delete a->hmatrix;
938: delete a->dist_hmatrix;
939: a->hmatrix = NULL;
940: a->dist_hmatrix = NULL;
941: }
942: A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
943: #endif
944: PetscCall(PetscLogEventEnd(MAT_H2Opus_Build, A, 0, 0, 0));
946: if (!a->s) a->s = 1.0;
947: A->assembled = PETSC_TRUE;
949: if (samplingdone) {
950: PetscBool check = a->check_construction;
951: PetscBool checke = PETSC_FALSE;
953: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check", &check, NULL));
954: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_check_explicit", &checke, NULL));
955: if (check) {
956: Mat E, Ae;
957: PetscReal n1, ni, n2;
958: PetscReal n1A, niA, n2A;
959: void (*normfunc)(void);
961: Ae = a->sampler->GetSamplingMat();
962: PetscCall(MatConvert(A, MATSHELL, MAT_INITIAL_MATRIX, &E));
963: PetscCall(MatShellSetOperation(E, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
964: PetscCall(MatAXPY(E, -1.0, Ae, DIFFERENT_NONZERO_PATTERN));
965: PetscCall(MatNorm(E, NORM_1, &n1));
966: PetscCall(MatNorm(E, NORM_INFINITY, &ni));
967: PetscCall(MatNorm(E, NORM_2, &n2));
968: if (checke) {
969: Mat eA, eE, eAe;
971: PetscCall(MatComputeOperator(A, MATAIJ, &eA));
972: PetscCall(MatComputeOperator(E, MATAIJ, &eE));
973: PetscCall(MatComputeOperator(Ae, MATAIJ, &eAe));
974: PetscCall(MatFilter(eA, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
975: PetscCall(MatFilter(eE, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
976: PetscCall(MatFilter(eAe, PETSC_SMALL, PETSC_FALSE, PETSC_FALSE));
977: PetscCall(PetscObjectSetName((PetscObject)eA, "H2Mat"));
978: PetscCall(MatView(eA, NULL));
979: PetscCall(PetscObjectSetName((PetscObject)eAe, "S"));
980: PetscCall(MatView(eAe, NULL));
981: PetscCall(PetscObjectSetName((PetscObject)eE, "H2Mat - S"));
982: PetscCall(MatView(eE, NULL));
983: PetscCall(MatDestroy(&eA));
984: PetscCall(MatDestroy(&eE));
985: PetscCall(MatDestroy(&eAe));
986: }
988: PetscCall(MatGetOperation(Ae, MATOP_NORM, &normfunc));
989: PetscCall(MatSetOperation(Ae, MATOP_NORM, (void (*)(void))MatNorm_H2OPUS));
990: PetscCall(MatNorm(Ae, NORM_1, &n1A));
991: PetscCall(MatNorm(Ae, NORM_INFINITY, &niA));
992: PetscCall(MatNorm(Ae, NORM_2, &n2A));
993: n1A = PetscMax(n1A, PETSC_SMALL);
994: n2A = PetscMax(n2A, PETSC_SMALL);
995: niA = PetscMax(niA, PETSC_SMALL);
996: PetscCall(MatSetOperation(Ae, MATOP_NORM, normfunc));
997: PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A), "MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n", (double)n1, (double)ni, (double)n2, (double)(n1 / n1A), (double)(ni / niA), (double)(n2 / n2A)));
998: PetscCall(MatDestroy(&E));
999: }
1000: a->sampler->SetSamplingMat(NULL);
1001: }
1002: PetscFunctionReturn(PETSC_SUCCESS);
1003: }
1005: static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1006: {
1007: PetscMPIInt size;
1008: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1010: PetscFunctionBegin;
1011: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1012: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet supported");
1013: a->hmatrix->clearData();
1014: #if defined(PETSC_H2OPUS_USE_GPU)
1015: if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1016: #endif
1017: PetscFunctionReturn(PETSC_SUCCESS);
1018: }
1020: static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1021: {
1022: Mat A;
1023: Mat_H2OPUS *a, *b = (Mat_H2OPUS *)B->data;
1024: #if defined(PETSC_H2OPUS_USE_GPU)
1025: PetscBool iscpu = PETSC_FALSE;
1026: #else
1027: PetscBool iscpu = PETSC_TRUE;
1028: #endif
1029: MPI_Comm comm;
1031: PetscFunctionBegin;
1032: PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1033: PetscCall(MatCreate(comm, &A));
1034: PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1035: PetscCall(MatSetType(A, MATH2OPUS));
1036: PetscCall(MatPropagateSymmetryOptions(B, A));
1037: a = (Mat_H2OPUS *)A->data;
1039: a->eta = b->eta;
1040: a->leafsize = b->leafsize;
1041: a->basisord = b->basisord;
1042: a->max_rank = b->max_rank;
1043: a->bs = b->bs;
1044: a->rtol = b->rtol;
1045: a->norm_max_samples = b->norm_max_samples;
1046: if (op == MAT_COPY_VALUES) a->s = b->s;
1048: a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1049: if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1051: #if defined(H2OPUS_USE_MPI)
1052: if (b->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix);
1053: #if defined(PETSC_H2OPUS_USE_GPU)
1054: if (b->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu);
1055: #endif
1056: #endif
1057: if (b->hmatrix) {
1058: a->hmatrix = new HMatrix(*b->hmatrix);
1059: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1060: }
1061: #if defined(PETSC_H2OPUS_USE_GPU)
1062: if (b->hmatrix_gpu) {
1063: a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1064: if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1065: }
1066: #endif
1067: if (b->sf) {
1068: PetscCall(PetscObjectReference((PetscObject)b->sf));
1069: a->sf = b->sf;
1070: }
1071: if (b->h2opus_indexmap) {
1072: PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1073: a->h2opus_indexmap = b->h2opus_indexmap;
1074: }
1076: PetscCall(MatSetUp(A));
1077: PetscCall(MatSetUpMultiply_H2OPUS(A));
1078: if (op == MAT_COPY_VALUES) {
1079: A->assembled = PETSC_TRUE;
1080: a->orthogonal = b->orthogonal;
1081: #if defined(PETSC_H2OPUS_USE_GPU)
1082: A->offloadmask = B->offloadmask;
1083: #endif
1084: }
1085: #if defined(PETSC_H2OPUS_USE_GPU)
1086: iscpu = B->boundtocpu;
1087: #endif
1088: PetscCall(MatBindToCPU(A, iscpu));
1090: *nA = A;
1091: PetscFunctionReturn(PETSC_SUCCESS);
1092: }
1094: static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1095: {
1096: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1097: PetscBool isascii, vieweps;
1098: PetscMPIInt size;
1099: PetscViewerFormat format;
1101: PetscFunctionBegin;
1102: PetscCall(PetscObjectTypeCompare((PetscObject)view, PETSCVIEWERASCII, &isascii));
1103: PetscCall(PetscViewerGetFormat(view, &format));
1104: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1105: if (isascii) {
1106: if (format == PETSC_VIEWER_ASCII_MATLAB) {
1107: if (size == 1) {
1108: FILE *fp;
1109: PetscCall(PetscViewerASCIIGetPointer(view, &fp));
1110: dumpHMatrix(*h2opus->hmatrix, 6, fp);
1111: }
1112: } else {
1113: PetscCall(PetscViewerASCIIPrintf(view, " H-Matrix constructed from %s\n", h2opus->kernel ? "Kernel" : "Mat"));
1114: PetscCall(PetscViewerASCIIPrintf(view, " PointCloud dim %" PetscInt_FMT "\n", h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1115: PetscCall(PetscViewerASCIIPrintf(view, " Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n", h2opus->leafsize, (double)h2opus->eta));
1116: if (!h2opus->kernel) {
1117: PetscCall(PetscViewerASCIIPrintf(view, " Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n", h2opus->max_rank, h2opus->bs, (double)h2opus->rtol));
1118: } else {
1119: PetscCall(PetscViewerASCIIPrintf(view, " Off-diagonal blocks approximation order %" PetscInt_FMT "\n", h2opus->basisord));
1120: }
1121: PetscCall(PetscViewerASCIIPrintf(view, " Number of samples for norms %" PetscInt_FMT "\n", h2opus->norm_max_samples));
1122: if (size == 1) {
1123: double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1124: double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1125: #if defined(PETSC_HAVE_CUDA)
1126: double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1127: double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1128: #endif
1129: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1130: #if defined(PETSC_HAVE_CUDA)
1131: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1132: #endif
1133: } else {
1134: #if defined(PETSC_HAVE_CUDA)
1135: double matrix_mem[4] = {0., 0., 0., 0.};
1136: PetscMPIInt rsize = 4;
1137: #else
1138: double matrix_mem[2] = {0., 0.};
1139: PetscMPIInt rsize = 2;
1140: #endif
1141: #if defined(H2OPUS_USE_MPI)
1142: matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1143: matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1144: #if defined(PETSC_HAVE_CUDA)
1145: matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1146: matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1147: #endif
1148: #endif
1149: PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, matrix_mem, rsize, MPI_DOUBLE_PRECISION, MPI_SUM, PetscObjectComm((PetscObject)A)));
1150: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1151: #if defined(PETSC_HAVE_CUDA)
1152: PetscCall(PetscViewerASCIIPrintf(view, " Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1153: #endif
1154: }
1155: }
1156: }
1157: vieweps = PETSC_FALSE;
1158: PetscCall(PetscOptionsGetBool(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps", &vieweps, NULL));
1159: if (vieweps) {
1160: char filename[256];
1161: const char *name;
1163: PetscCall(PetscObjectGetName((PetscObject)A, &name));
1164: PetscCall(PetscSNPrintf(filename, sizeof(filename), "%s_structure.eps", name));
1165: PetscCall(PetscOptionsGetString(((PetscObject)A)->options, ((PetscObject)A)->prefix, "-mat_h2opus_vieweps_filename", filename, sizeof(filename), NULL));
1166: outputEps(*h2opus->hmatrix, filename);
1167: }
1168: PetscFunctionReturn(PETSC_SUCCESS);
1169: }
1171: static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx)
1172: {
1173: Mat_H2OPUS *h2opus = (Mat_H2OPUS *)A->data;
1174: PetscReal *gcoords;
1175: PetscInt N;
1176: MPI_Comm comm;
1177: PetscMPIInt size;
1178: PetscBool cong;
1180: PetscFunctionBegin;
1181: PetscCall(PetscLayoutSetUp(A->rmap));
1182: PetscCall(PetscLayoutSetUp(A->cmap));
1183: PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1184: PetscCall(MatHasCongruentLayouts(A, &cong));
1185: PetscCheck(cong, comm, PETSC_ERR_SUP, "Only for square matrices with congruent layouts");
1186: N = A->rmap->N;
1187: PetscCallMPI(MPI_Comm_size(comm, &size));
1188: if (spacedim > 0 && size > 1 && cdist) {
1189: PetscSF sf;
1190: MPI_Datatype dtype;
1192: PetscCallMPI(MPI_Type_contiguous(spacedim, MPIU_REAL, &dtype));
1193: PetscCallMPI(MPI_Type_commit(&dtype));
1195: PetscCall(PetscSFCreate(comm, &sf));
1196: PetscCall(PetscSFSetGraphWithPattern(sf, A->rmap, PETSCSF_PATTERN_ALLGATHER));
1197: PetscCall(PetscMalloc1(spacedim * N, &gcoords));
1198: PetscCall(PetscSFBcastBegin(sf, dtype, coords, gcoords, MPI_REPLACE));
1199: PetscCall(PetscSFBcastEnd(sf, dtype, coords, gcoords, MPI_REPLACE));
1200: PetscCall(PetscSFDestroy(&sf));
1201: PetscCallMPI(MPI_Type_free(&dtype));
1202: } else gcoords = (PetscReal *)coords;
1204: delete h2opus->ptcloud;
1205: delete h2opus->kernel;
1206: h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim, N, gcoords);
1207: if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel, spacedim, kernelctx);
1208: if (gcoords != coords) PetscCall(PetscFree(gcoords));
1209: A->preallocated = PETSC_TRUE;
1210: PetscFunctionReturn(PETSC_SUCCESS);
1211: }
1213: #if defined(PETSC_H2OPUS_USE_GPU)
1214: static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1215: {
1216: PetscMPIInt size;
1217: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1219: PetscFunctionBegin;
1220: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1221: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1222: if (size > 1) {
1223: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1224: #if defined(H2OPUS_USE_MPI)
1225: if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1226: else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1227: #endif
1228: } else {
1229: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1230: if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1231: else *a->hmatrix = *a->hmatrix_gpu;
1232: }
1233: delete a->hmatrix_gpu;
1234: delete a->dist_hmatrix_gpu;
1235: a->hmatrix_gpu = NULL;
1236: a->dist_hmatrix_gpu = NULL;
1237: A->offloadmask = PETSC_OFFLOAD_CPU;
1238: } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1239: if (size > 1) {
1240: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1241: #if defined(H2OPUS_USE_MPI)
1242: if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1243: else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1244: #endif
1245: } else {
1246: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1247: if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1248: else *a->hmatrix_gpu = *a->hmatrix;
1249: }
1250: delete a->hmatrix;
1251: delete a->dist_hmatrix;
1252: a->hmatrix = NULL;
1253: a->dist_hmatrix = NULL;
1254: A->offloadmask = PETSC_OFFLOAD_GPU;
1255: }
1256: PetscCall(PetscFree(A->defaultvectype));
1257: if (!flg) {
1258: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1259: } else {
1260: PetscCall(PetscStrallocpy(VECSTANDARD, &A->defaultvectype));
1261: }
1262: A->boundtocpu = flg;
1263: PetscFunctionReturn(PETSC_SUCCESS);
1264: }
1265: #endif
1267: /*MC
1268: MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package {cite}`zampinibouakaramturkiyyahkniokeyes2022`.
1270: Options Database Key:
1271: . -mat_type h2opus - sets the matrix type to "h2opus"
1273: Level: beginner
1275: Notes:
1276: H2Opus implements hierarchical matrices in the $H^2$ flavour. It supports execution on CPUs or NVIDIA GPUs.
1278: For CPU-only builds, configure PETSc with `./configure --download-h2opus --download-thrust` to use H2Opus.
1279: To run on NVIDIA GPUs, configure with `./configure --download-h2opus --download-magma --download-kblas`.
1281: .seealso: [](ch_matrices), `Mat`, `MATH2OPUS`, `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1282: M*/
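/* Editor's note (illustrative sketch, not part of the original source): a minimal example of creating
   a MATH2OPUS matrix from a kernel, assuming the MatCreateH2OpusFromKernel() interface from petscmat.h,
   n local / N global points with coordinates in coords[], and a user kernel MyKernel with the
   MatH2OpusKernelFn signature; the eta, leaf size and basis order values are only examples. */
#if 0
  Mat A;
  PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, N, N, sdim, coords, PETSC_TRUE, MyKernel, NULL, 0.9, 32, 4, &A));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
#endif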
1283: PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1284: {
1285: Mat_H2OPUS *a;
1286: PetscMPIInt size;
1288: PetscFunctionBegin;
1289: #if defined(PETSC_H2OPUS_USE_GPU)
1290: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1291: #endif
1292: PetscCall(PetscNew(&a));
1293: A->data = (void *)a;
1295: a->eta = 0.9;
1296: a->leafsize = 32;
1297: a->basisord = 4;
1298: a->max_rank = 64;
1299: a->bs = 32;
1300: a->rtol = 1.e-4;
1301: a->s = 1.0;
1302: a->norm_max_samples = 10;
1303: a->resize = PETSC_TRUE; /* reallocate after compression */
1304: #if defined(H2OPUS_USE_MPI)
1305: h2opusCreateDistributedHandleComm(&a->handle, PetscObjectComm((PetscObject)A));
1306: #else
1307: h2opusCreateHandle(&a->handle);
1308: #endif
1309: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1310: PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATH2OPUS));
1311: PetscCall(PetscMemzero(A->ops, sizeof(struct _MatOps)));
1313: A->ops->destroy = MatDestroy_H2OPUS;
1314: A->ops->view = MatView_H2OPUS;
1315: A->ops->assemblyend = MatAssemblyEnd_H2OPUS;
1316: A->ops->mult = MatMult_H2OPUS;
1317: A->ops->multtranspose = MatMultTranspose_H2OPUS;
1318: A->ops->multadd = MatMultAdd_H2OPUS;
1319: A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1320: A->ops->scale = MatScale_H2OPUS;
1321: A->ops->duplicate = MatDuplicate_H2OPUS;
1322: A->ops->setfromoptions = MatSetFromOptions_H2OPUS;
1323: A->ops->norm = MatNorm_H2OPUS;
1324: A->ops->zeroentries = MatZeroEntries_H2OPUS;
1325: #if defined(PETSC_H2OPUS_USE_GPU)
1326: A->ops->bindtocpu = MatBindToCPU_H2OPUS;
1327: #endif
1329: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdense_C", MatProductSetFromOptions_H2OPUS));
1330: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_seqdensecuda_C", MatProductSetFromOptions_H2OPUS));
1331: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidense_C", MatProductSetFromOptions_H2OPUS));
1332: PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_h2opus_mpidensecuda_C", MatProductSetFromOptions_H2OPUS));
1333: #if defined(PETSC_H2OPUS_USE_GPU)
1334: PetscCall(PetscFree(A->defaultvectype));
1335: PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
1336: #endif
1337: PetscFunctionReturn(PETSC_SUCCESS);
1338: }
1340: /*@
1341: MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1343: Input Parameter:
1344: . A - the matrix
1346: Level: intermediate
1348: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1349: @*/
1350: PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1351: {
1352: PetscBool ish2opus;
1353: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1354: PetscMPIInt size;
1355: PetscBool boundtocpu = PETSC_TRUE;
1357: PetscFunctionBegin;
1360: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1361: if (!ish2opus) PetscFunctionReturn(PETSC_SUCCESS);
1362: if (a->orthogonal) PetscFunctionReturn(PETSC_SUCCESS);
1363: HLibProfile::clear();
1364: PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog, A, 0, 0, 0));
1365: #if defined(PETSC_H2OPUS_USE_GPU)
1366: boundtocpu = A->boundtocpu;
1367: #endif
1368: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1369: if (size > 1) {
1370: if (boundtocpu) {
1371: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1372: #if defined(H2OPUS_USE_MPI)
1373: distributed_horthog(*a->dist_hmatrix, a->handle);
1374: #endif
1375: #if defined(PETSC_H2OPUS_USE_GPU)
1376: A->offloadmask = PETSC_OFFLOAD_CPU;
1377: } else {
1378: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1379: PetscCall(PetscLogGpuTimeBegin());
1380: #if defined(H2OPUS_USE_MPI)
1381: distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1382: #endif
1383: PetscCall(PetscLogGpuTimeEnd());
1384: #endif
1385: }
1386: } else {
1387: #if defined(H2OPUS_USE_MPI)
1388: h2opusHandle_t handle = a->handle->handle;
1389: #else
1390: h2opusHandle_t handle = a->handle;
1391: #endif
1392: if (boundtocpu) {
1393: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1394: horthog(*a->hmatrix, handle);
1395: #if defined(PETSC_H2OPUS_USE_GPU)
1396: A->offloadmask = PETSC_OFFLOAD_CPU;
1397: } else {
1398: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1399: PetscCall(PetscLogGpuTimeBegin());
1400: horthog(*a->hmatrix_gpu, handle);
1401: PetscCall(PetscLogGpuTimeEnd());
1402: #endif
1403: }
1404: }
1405: a->orthogonal = PETSC_TRUE;
1406: { /* log flops */
1407: double gops, time, perf, dev;
1408: HLibProfile::getHorthogPerf(gops, time, perf, dev);
1409: #if defined(PETSC_H2OPUS_USE_GPU)
1410: if (boundtocpu) {
1411: PetscCall(PetscLogFlops(1e9 * gops));
1412: } else {
1413: PetscCall(PetscLogGpuFlops(1e9 * gops));
1414: }
1415: #else
1416: PetscCall(PetscLogFlops(1e9 * gops));
1417: #endif
1418: }
1419: PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog, A, 0, 0, 0));
1420: PetscFunctionReturn(PETSC_SUCCESS);
1421: }
1423: /*@
1424: MatH2OpusCompress - Compress a hierarchical matrix.
1426: Input Parameters:
1427: + A - the matrix
1428: - tol - the absolute truncation threshold
1430: Level: intermediate
1432: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1433: @*/
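/* Editor's note (illustrative sketch, not part of the original source): typical post-assembly usage of
   MatH2OpusOrthogonalize() and MatH2OpusCompress(), assuming an assembled MATH2OPUS matrix A; the
   threshold value is only an example. */
#if 0
  PetscCall(MatH2OpusOrthogonalize(A));   /* orthogonalize the basis tree (also done internally by MatH2OpusCompress) */
  PetscCall(MatH2OpusCompress(A, 1.e-6)); /* truncate with absolute threshold 1e-6 */
#endif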
1434: PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1435: {
1436: PetscBool ish2opus;
1437: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1438: PetscMPIInt size;
1439: PetscBool boundtocpu = PETSC_TRUE;
1441: PetscFunctionBegin;
1445: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1446: if (!ish2opus || tol <= 0.0) PetscFunctionReturn(PETSC_SUCCESS);
1447: PetscCall(MatH2OpusOrthogonalize(A));
1448: HLibProfile::clear();
1449: PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress, A, 0, 0, 0));
1450: #if defined(PETSC_H2OPUS_USE_GPU)
1451: boundtocpu = A->boundtocpu;
1452: #endif
1453: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1454: if (size > 1) {
1455: if (boundtocpu) {
1456: PetscCheck(a->dist_hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1457: #if defined(H2OPUS_USE_MPI)
1458: distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1459: if (a->resize) {
1460: DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1461: delete a->dist_hmatrix;
1462: a->dist_hmatrix = dist_hmatrix;
1463: }
1464: #endif
1465: #if defined(PETSC_H2OPUS_USE_GPU)
1466: A->offloadmask = PETSC_OFFLOAD_CPU;
1467: } else {
1468: PetscCheck(a->dist_hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1469: PetscCall(PetscLogGpuTimeBegin());
1470: #if defined(H2OPUS_USE_MPI)
1471: distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1473: if (a->resize) {
1474: DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1475: delete a->dist_hmatrix_gpu;
1476: a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1477: }
1478: #endif
1479: PetscCall(PetscLogGpuTimeEnd());
1480: #endif
1481: }
1482: } else {
1483: #if defined(H2OPUS_USE_MPI)
1484: h2opusHandle_t handle = a->handle->handle;
1485: #else
1486: h2opusHandle_t handle = a->handle;
1487: #endif
1488: if (boundtocpu) {
1489: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1490: hcompress(*a->hmatrix, tol, handle);
1492: if (a->resize) {
1493: HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1494: delete a->hmatrix;
1495: a->hmatrix = hmatrix;
1496: }
1497: #if defined(PETSC_H2OPUS_USE_GPU)
1498: A->offloadmask = PETSC_OFFLOAD_CPU;
1499: } else {
1500: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1501: PetscCall(PetscLogGpuTimeBegin());
1502: hcompress(*a->hmatrix_gpu, tol, handle);
1503: PetscCall(PetscLogGpuTimeEnd());
1505: if (a->resize) {
1506: HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1507: delete a->hmatrix_gpu;
1508: a->hmatrix_gpu = hmatrix_gpu;
1509: }
1510: #endif
1511: }
1512: }
1513: { /* log flops */
1514: double gops, time, perf, dev;
1515: HLibProfile::getHcompressPerf(gops, time, perf, dev);
1516: #if defined(PETSC_H2OPUS_USE_GPU)
1517: if (boundtocpu) {
1518: PetscCall(PetscLogFlops(1e9 * gops));
1519: } else {
1520: PetscCall(PetscLogGpuFlops(1e9 * gops));
1521: }
1522: #else
1523: PetscCall(PetscLogFlops(1e9 * gops));
1524: #endif
1525: }
1526: PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress, A, 0, 0, 0));
1527: PetscFunctionReturn(PETSC_SUCCESS);
1528: }
1530: /*@
1531: MatH2OpusSetSamplingMat - Set the matrix whose matrix-vector products are sampled to construct the hierarchical matrix.
1533: Input Parameters:
1534: + A - the hierarchical matrix
1535: . B - the matrix to be sampled
1536: . bs - maximum number of samples to be taken concurrently
1537: - tol - relative tolerance for construction
1539: Level: intermediate
1541: Notes:
1542: You need to call `MatAssemblyBegin()` and `MatAssemblyEnd()` to update the hierarchical matrix.
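For example, to resample after the operator being approximated has changed (a minimal sketch; `Bnew` is a placeholder for the updated operator, and non-positive `bs` and `tol` keep the current values):
.vb
  PetscCall(MatH2OpusSetSamplingMat(A, Bnew, 0, 0.0));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve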
1544: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1545: @*/
1546: PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1547: {
1548: PetscBool ish2opus;
1550: PetscFunctionBegin;
1556: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1557: if (ish2opus) {
1558: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1560: if (!a->sampler) a->sampler = new PetscMatrixSampler();
1561: a->sampler->SetSamplingMat(B);
1562: if (bs > 0) a->bs = bs;
1563: if (tol > 0.) a->rtol = tol;
1564: delete a->kernel;
1565: }
1566: PetscFunctionReturn(PETSC_SUCCESS);
1567: }
1569: /*@C
1570: MatCreateH2OpusFromKernel - Creates a `MATH2OPUS` from a user-supplied kernel.
1572: Input Parameters:
1573: + comm - MPI communicator
1574: . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1575: . n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
1576: . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1577: . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1578: . spacedim - dimension of the space coordinates
1579: . coords - coordinates of the points
1580: . cdist - whether or not coordinates are distributed
1581: . kernel - computational kernel (or `NULL`)
1582: . kernelctx - kernel context
1583: . eta - admissibility condition tolerance
1584: . leafsize - leaf size in cluster tree
1585: - basisord - approximation order for Chebyshev interpolation of low-rank blocks
1587: Output Parameter:
1588: . nA - matrix
1590: Options Database Keys:
1591: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1592: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1593: . -mat_h2opus_order <`PetscInt`> - Chebyshev approximation order
1594: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1596: Level: intermediate
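Example Usage:
A minimal sketch for a Gaussian kernel in two space dimensions; the kernel, the interleaved coordinate array `coords` (with `n` local points), and the clustering parameters are illustrative:
.vb
  static PetscScalar GaussKernel(PetscInt sdim, PetscReal x[], PetscReal y[], void *ctx)
  {
    PetscReal r2 = 0.0;

    for (PetscInt d = 0; d < sdim; d++) r2 += (x[d] - y[d]) * (x[d] - y[d]);
    return PetscExpReal(-r2);
  }

  PetscCall(MatCreateH2OpusFromKernel(PETSC_COMM_WORLD, n, n, PETSC_DETERMINE, PETSC_DETERMINE, 2, coords, PETSC_TRUE, GaussKernel, NULL, 0.9, 32, 4, &A));
  PetscCall(MatSetOption(A, MAT_SYMMETRIC, PETSC_TRUE)); /* the kernel is symmetric */
  PetscCall(MatSetFromOptions(A));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve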
1598: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1599: @*/
1600: PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernelFn *kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat *nA)
1601: {
1602: Mat A;
1603: Mat_H2OPUS *h2opus;
1604: #if defined(PETSC_H2OPUS_USE_GPU)
1605: PetscBool iscpu = PETSC_FALSE;
1606: #else
1607: PetscBool iscpu = PETSC_TRUE;
1608: #endif
1610: PetscFunctionBegin;
1611: PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1612: PetscCall(MatCreate(comm, &A));
1613: PetscCall(MatSetSizes(A, m, n, M, N));
1614: PetscCheck(M == N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1615: PetscCall(MatSetType(A, MATH2OPUS));
1616: PetscCall(MatBindToCPU(A, iscpu));
1617: PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, kernel, kernelctx));
1619: h2opus = (Mat_H2OPUS *)A->data;
1620: if (eta > 0.) h2opus->eta = eta;
1621: if (leafsize > 0) h2opus->leafsize = leafsize;
1622: if (basisord > 0) h2opus->basisord = basisord;
1624: *nA = A;
1625: PetscFunctionReturn(PETSC_SUCCESS);
1626: }
1628: /*@
1629: MatCreateH2OpusFromMat - Creates a `MATH2OPUS` matrix by sampling the action of a user-supplied operator.
1631: Input Parameters:
1632: + B - the matrix to be sampled
1633: . spacedim - dimension of the space coordinates
1634: . coords - coordinates of the points
1635: . cdist - whether or not coordinates are distributed
1636: . eta - admissibility condition tolerance
1637: . leafsize - leaf size in cluster tree
1638: . maxrank - maximum rank allowed
1639: . bs - maximum number of samples to be taken concurrently
1640: - rtol - relative tolerance for construction
1642: Output Parameter:
1643: . nA - matrix
1645: Options Database Keys:
1646: + -mat_h2opus_leafsize <`PetscInt`> - Leaf size of cluster tree
1647: . -mat_h2opus_eta <`PetscReal`> - Admissibility condition tolerance
1648: . -mat_h2opus_maxrank <`PetscInt`> - Maximum rank when constructed from matvecs
1649: . -mat_h2opus_samples <`PetscInt`> - Maximum number of samples to be taken concurrently when constructing from matvecs
1650: . -mat_h2opus_rtol <`PetscReal`> - Relative tolerance for construction from sampling
1651: . -mat_h2opus_check <`PetscBool`> - Check the approximation error when constructing from sampling during `MatAssemblyEnd()`
1652: . -mat_h2opus_hara_verbose <`PetscBool`> - Verbose output from the HARA construction
1653: - -mat_h2opus_normsamples <`PetscInt`> - Maximum number of samples to be used when estimating norms
1655: Level: intermediate
1657: Note:
1658: Not available in parallel
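Example Usage:
A minimal sketch (sequential, since this routine is not available in parallel); the clustering and sampling parameters are illustrative and can also be set from the options database:
.vb
  PetscCall(MatCreateH2OpusFromMat(B, spacedim, coords, PETSC_FALSE, 0.9, 32, 64, 32, 1.e-4, &A));
  PetscCall(MatSetFromOptions(A));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); /* the sampling construction happens here */
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
.ve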
1660: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1661: @*/
1662: PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1663: {
1664: Mat A;
1665: Mat_H2OPUS *h2opus;
1666: MPI_Comm comm;
1667: PetscBool boundtocpu = PETSC_TRUE;
1669: PetscFunctionBegin;
1678: PetscAssertPointer(nA, 10);
1679: PetscCall(PetscObjectGetComm((PetscObject)B, &comm));
1680: PetscCheck(B->rmap->n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Different row and column local sizes are not supported");
1681: PetscCheck(B->rmap->N == B->cmap->N, comm, PETSC_ERR_SUP, "Rectangular matrices are not supported");
1682: PetscCall(MatCreate(comm, &A));
1683: PetscCall(MatSetSizes(A, B->rmap->n, B->cmap->n, B->rmap->N, B->cmap->N));
1684: #if defined(PETSC_H2OPUS_USE_GPU)
1685: {
1686: VecType vtype;
1687: PetscBool isstd, iscuda, iskok;
1689: PetscCall(MatGetVecType(B, &vtype));
1690: PetscCall(PetscStrcmpAny(vtype, &isstd, VECSTANDARD, VECSEQ, VECMPI, ""));
1691: PetscCall(PetscStrcmpAny(vtype, &iscuda, VECCUDA, VECSEQCUDA, VECMPICUDA, ""));
1692: PetscCall(PetscStrcmpAny(vtype, &iskok, VECKOKKOS, VECSEQKOKKOS, VECMPIKOKKOS, ""));
1693: PetscCheck(isstd || iscuda || iskok, comm, PETSC_ERR_SUP, "Not for type %s", vtype);
1694: if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1695: if (iskok && PetscDefined(HAVE_MACRO_KOKKOS_ENABLE_CUDA)) boundtocpu = PETSC_FALSE;
1696: }
1697: #endif
1698: PetscCall(MatSetType(A, MATH2OPUS));
1699: PetscCall(MatBindToCPU(A, boundtocpu));
1700: if (spacedim) PetscCall(MatH2OpusSetCoords_H2OPUS(A, spacedim, coords, cdist, NULL, NULL));
1701: PetscCall(MatPropagateSymmetryOptions(B, A));
1702: /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1704: h2opus = (Mat_H2OPUS *)A->data;
1705: h2opus->sampler = new PetscMatrixSampler(B);
1706: if (eta > 0.) h2opus->eta = eta;
1707: if (leafsize > 0) h2opus->leafsize = leafsize;
1708: if (maxrank > 0) h2opus->max_rank = maxrank;
1709: if (bs > 0) h2opus->bs = bs;
1710: if (rtol > 0.) h2opus->rtol = rtol;
1711: *nA = A;
1712: A->preallocated = PETSC_TRUE;
1713: PetscFunctionReturn(PETSC_SUCCESS);
1714: }
1716: /*@
1717: MatH2OpusGetIndexMap - Access reordering index set.
1719: Input Parameter:
1720: . A - the matrix
1722: Output Parameter:
1723: . indexmap - the index set for the reordering
1725: Level: intermediate
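Note:
The returned `IS` is owned by the matrix and should not be destroyed by the caller.

Example Usage:
A minimal sketch that inspects the reordering:
.vb
  IS indexmap;

  PetscCall(MatH2OpusGetIndexMap(A, &indexmap));
  PetscCall(ISView(indexmap, PETSC_VIEWER_STDOUT_WORLD));
.ve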
1727: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1728: @*/
1729: PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1730: {
1731: PetscBool ish2opus;
1732: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1734: PetscFunctionBegin;
1737: PetscAssertPointer(indexmap, 2);
1738: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1739: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1740: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1741: *indexmap = a->h2opus_indexmap;
1742: PetscFunctionReturn(PETSC_SUCCESS);
1743: }
1745: /*@
1746: MatH2OpusMapVec - Maps a vector between the PETSc and H2Opus orderings
1748: Input Parameters:
1749: + A - the matrix
1750: . nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1751: - in - the vector to be mapped
1753: Output Parameter:
1754: . out - the newly created mapped vector
1756: Level: intermediate
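Example Usage:
A minimal sketch mapping a vector `x` from the PETSc ordering to the H2Opus ordering and back; the output vectors are created by the call and must be destroyed by the caller:
.vb
  Vec xh, xp;

  PetscCall(MatH2OpusMapVec(A, PETSC_FALSE, x, &xh)); /* PETSc ordering -> H2Opus ordering */
  PetscCall(MatH2OpusMapVec(A, PETSC_TRUE, xh, &xp)); /* H2Opus ordering -> PETSc ordering */
  PetscCall(VecDestroy(&xh));
  PetscCall(VecDestroy(&xp));
.ve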
1758: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1759: @*/
1760: PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec *out)
1761: {
1762: PetscBool ish2opus;
1763: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1764: PetscScalar *xin, *xout;
1765: PetscBool nm;
1767: PetscFunctionBegin;
1772: PetscAssertPointer(out, 4);
1773: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1774: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &ish2opus));
1775: PetscCheck(ish2opus, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
1776: nm = a->nativemult;
1777: PetscCall(MatH2OpusSetNativeMult(A, (PetscBool)!nativetopetsc));
1778: PetscCall(MatCreateVecs(A, out, NULL));
1779: PetscCall(MatH2OpusSetNativeMult(A, nm));
1780: if (!a->sf) { /* same ordering */
1781: PetscCall(VecCopy(in, *out));
1782: PetscFunctionReturn(PETSC_SUCCESS);
1783: }
1784: PetscCall(VecGetArrayRead(in, (const PetscScalar **)&xin));
1785: PetscCall(VecGetArrayWrite(*out, &xout));
1786: if (nativetopetsc) {
1787: PetscCall(PetscSFReduceBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1788: PetscCall(PetscSFReduceEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1789: } else {
1790: PetscCall(PetscSFBcastBegin(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1791: PetscCall(PetscSFBcastEnd(a->sf, MPIU_SCALAR, xin, xout, MPI_REPLACE));
1792: }
1793: PetscCall(VecRestoreArrayRead(in, (const PetscScalar **)&xin));
1794: PetscCall(VecRestoreArrayWrite(*out, &xout));
1795: PetscFunctionReturn(PETSC_SUCCESS);
1796: }
1798: /*@
1799: MatH2OpusLowRankUpdate - Perform a low-rank update of the form $ A = A + s * U * V^T $
1801: Input Parameters:
1802: + A - the hierarchical `MATH2OPUS` matrix
1803: . U - the dense low-rank update matrix
1804: . V - (optional) the dense low-rank update matrix (if `NULL`, then `V` = `U` is assumed)
1805: - s - the scaling factor
1807: Note:
1808: The `U` and `V` matrices must be in `MATDENSE` format
1810: Level: intermediate
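Example Usage:
A minimal sketch; `U` and `V` are `MATDENSE` matrices with the same number of columns and row layouts matching `A`:
.vb
  PetscCall(MatH2OpusLowRankUpdate(A, U, V, 1.0));     /* A = A + U * V^T */
  PetscCall(MatH2OpusLowRankUpdate(A, U, NULL, -2.0)); /* A = A - 2 * U * U^T, since V = NULL means V = U */
.ve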
1812: .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1813: @*/
1814: PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1815: {
1816: PetscBool flg;
1818: PetscFunctionBegin;
1821: PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
1823: PetscCheckSameComm(A, 1, U, 2);
1824: if (V) {
1826: PetscCheckSameComm(A, 1, V, 3);
1827: }
1830: if (!V) V = U;
1831: PetscCheck(U->cmap->N == V->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Non-matching rank update %" PetscInt_FMT " != %" PetscInt_FMT, U->cmap->N, V->cmap->N);
1832: if (!U->cmap->N) PetscFunctionReturn(PETSC_SUCCESS);
1833: PetscCall(PetscLayoutCompare(U->rmap, A->rmap, &flg));
1834: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A and U must have the same row layout");
1835: PetscCall(PetscLayoutCompare(V->rmap, A->cmap, &flg));
1836: PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "A column layout must match V row layout");
1837: PetscCall(PetscObjectTypeCompare((PetscObject)A, MATH2OPUS, &flg));
1838: if (flg) {
1839: Mat_H2OPUS *a = (Mat_H2OPUS *)A->data;
1840: const PetscScalar *u, *v, *uu, *vv;
1841: PetscInt ldu, ldv;
1842: PetscMPIInt size;
1843: #if defined(H2OPUS_USE_MPI)
1844: h2opusHandle_t handle = a->handle->handle;
1845: #else
1846: h2opusHandle_t handle = a->handle;
1847: #endif
1848: PetscBool usesf = (PetscBool)(a->sf && !a->nativemult);
1849: PetscSF usf, vsf;
1851: PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
1852: PetscCheck(size <= 1, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not yet implemented in parallel");
1853: PetscCall(PetscLogEventBegin(MAT_H2Opus_LR, A, 0, 0, 0));
1854: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1855: PetscCheck(flg, PetscObjectComm((PetscObject)U), PETSC_ERR_SUP, "Not for U of type %s", ((PetscObject)U)->type_name);
1856: PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V, &flg, MATSEQDENSE, MATMPIDENSE, ""));
1857: PetscCheck(flg, PetscObjectComm((PetscObject)V), PETSC_ERR_SUP, "Not for V of type %s", ((PetscObject)V)->type_name);
1858: PetscCall(MatDenseGetLDA(U, &ldu));
1859: PetscCall(MatDenseGetLDA(V, &ldv));
1860: PetscCall(MatBoundToCPU(A, &flg));
1861: if (usesf) {
1862: PetscInt n;
1864: PetscCall(MatDenseGetH2OpusStridedSF(U, a->sf, &usf));
1865: PetscCall(MatDenseGetH2OpusStridedSF(V, a->sf, &vsf));
1866: PetscCall(MatH2OpusResizeBuffers_Private(A, U->cmap->N, V->cmap->N));
1867: PetscCall(PetscSFGetGraph(a->sf, NULL, &n, NULL, NULL));
1868: ldu = n;
1869: ldv = n;
1870: }
1871: if (flg) {
1872: PetscCheck(a->hmatrix, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing CPU matrix");
1873: PetscCall(MatDenseGetArrayRead(U, &u));
1874: PetscCall(MatDenseGetArrayRead(V, &v));
1875: if (usesf) {
1876: vv = MatH2OpusGetThrustPointer(*a->yy);
1877: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1878: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1879: if (U != V) {
1880: uu = MatH2OpusGetThrustPointer(*a->xx);
1881: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1882: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1883: } else uu = vv;
1884: } else {
1885: uu = u;
1886: vv = v;
1887: }
1888: hlru_global(*a->hmatrix, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1889: PetscCall(MatDenseRestoreArrayRead(U, &u));
1890: PetscCall(MatDenseRestoreArrayRead(V, &v));
1891: } else {
1892: #if defined(PETSC_H2OPUS_USE_GPU)
1893: PetscBool flgU, flgV;
1895: PetscCheck(a->hmatrix_gpu, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing GPU matrix");
1896: PetscCall(PetscObjectTypeCompareAny((PetscObject)U, &flgU, MATSEQDENSE, MATMPIDENSE, ""));
1897: if (flgU) PetscCall(MatConvert(U, MATDENSECUDA, MAT_INPLACE_MATRIX, &U));
1898: PetscCall(PetscObjectTypeCompareAny((PetscObject)V, &flgV, MATSEQDENSE, MATMPIDENSE, ""));
1899: if (flgV) PetscCall(MatConvert(V, MATDENSECUDA, MAT_INPLACE_MATRIX, &V));
1900: PetscCall(MatDenseCUDAGetArrayRead(U, &u));
1901: PetscCall(MatDenseCUDAGetArrayRead(V, &v));
1902: if (usesf) {
1903: vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1904: PetscCall(PetscSFBcastBegin(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1905: PetscCall(PetscSFBcastEnd(vsf, MPIU_SCALAR, v, (PetscScalar *)vv, MPI_REPLACE));
1906: if (U != V) {
1907: uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1908: PetscCall(PetscSFBcastBegin(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1909: PetscCall(PetscSFBcastEnd(usf, MPIU_SCALAR, u, (PetscScalar *)uu, MPI_REPLACE));
1910: } else uu = vv;
1911: } else {
1912: uu = u;
1913: vv = v;
1914: }
1915: #else
1916: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "This should not happen");
1917: #endif
1918: hlru_global(*a->hmatrix_gpu, uu, ldu, vv, ldv, U->cmap->N, s, handle);
1919: #if defined(PETSC_H2OPUS_USE_GPU)
1920: PetscCall(MatDenseCUDARestoreArrayRead(U, &u));
1921: PetscCall(MatDenseCUDARestoreArrayRead(V, &v));
1922: if (flgU) PetscCall(MatConvert(U, MATDENSE, MAT_INPLACE_MATRIX, &U));
1923: if (flgV) PetscCall(MatConvert(V, MATDENSE, MAT_INPLACE_MATRIX, &V));
1924: #endif
1925: }
1926: PetscCall(PetscLogEventEnd(MAT_H2Opus_LR, A, 0, 0, 0));
1927: a->orthogonal = PETSC_FALSE;
1928: }
1929: PetscFunctionReturn(PETSC_SUCCESS);
1930: }
1931: #endif